diff --git a/.gitignore b/.gitignore index 7bdd5cfd8..59df506be 100644 --- a/.gitignore +++ b/.gitignore @@ -1,80 +1,102 @@ *.swp Makefile.in aclocal.m4 autoconf autoheader autom4te.cache automake autoscan.log compile configure configure.scan config.guess config.log config.sub config.status Makefile depcomp install-sh libtoolize ltmain.sh libtool make/stamp-h1 m4 make/clusterautoconfig.h* missing *.pc .deps .libs *.o *.la *.lo *.loT rgmanager/src/resources/fs.sh rgmanager/src/resources/oracledb.sh rgmanager/src/resources/utils/config-utils.sh resource-agents-* .version +# generated by ./autogen.sh && ./configure +heartbeat/ocf-binaries +heartbeat/ocf-directories +heartbeat/ocf-shellfuncs +heartbeat/shellfuncs +include/agent_config.h +include/config.h +include/config.h.in +include/stamp-h1 +include/stamp-h2 +ldirectord/OCF/ldirectord +ldirectord/init.d/ldirectord +ldirectord/init.d/ldirectord.debian +ldirectord/init.d/ldirectord.debian.default +ldirectord/ldirectord +ldirectord/systemd/ldirectord.service +tools/ocf-tester +tools/ocft/README +tools/ocft/README.zh_CN +tools/ocft/caselib +tools/ocft/ocft + *.cache *.upgrade.xml py-compile ylwrap # BEAM Entries *.beam parser-messages MISC_ERRORS cscope.files cscope.out patches updates logs # OS and Editor Artifacts .DS_Store .bomb *.rej *.bz2 *.gz *.xz *.sed *.diff *.patch *.gres *~ # Misc HTML TAGS GPATH GRTAGS GSYMS GTAGS .gres.* *.orig .gdb_history *~ \#* .changes pacemaker.tar.gz diff --git a/.travis.yml b/.travis.yml index 79f775399..e6943fadd 100644 --- a/.travis.yml +++ b/.travis.yml @@ -1,8 +1,13 @@ language: bash -install: - - ./ci/install.sh +sudo: false + +addons: + apt: + sources: + - debian-sid + packages: + - shellcheck script: - ./ci/build.sh notifications: email: false -sudo: required diff --git a/ChangeLog b/ChangeLog index 38053273a..371157420 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,736 +1,834 @@ +* Wed Feb 3 2016 resource-agents contributors +- stable release 3.9.7 +- ldirectord: fix unset 
failcount error +- iscsi: add portal check to open_iscsi_get_session_id() +- galera: use mysql's --tc-heuristic-recover if crash recovery is needed +- nfsserver: fix monitor for systemd + +* Wed Jan 20 2016 resource-agents contributors +- release candidate 3.9.7 rc1 +- nfsserver.sh: add hostname attribute for NFS export (required for NFSv4+Kerberos support) +- oradg.sh: new RA for Oracle Data Guard +- ocf_shellfuncs: suppress bash specific trace_ra log on dash +- sg_persist: remove uncalled for ocf_run calls +- multiple RA: replace error log messages with calls to ocf_exit_reason +- nfsserver: only do redhat specific stuff on redhat +- exportfs: don't increment fsid for single directory +- Filesystem: add tmpfs support +- netfs.sh: move defaults to metadata +- nfsserver: /var/lock/subsys is non-standard, check for it first +- nagios: new RA +- docker: check for errors in the container name +- mysql: fix grep failure on MySQL 5.6 or higher when checking read_only variable +- VirtualDomain: new attributes migration_speed and migration_downtime +- fs: remove not-working tmpfs support +- vm.sh: add migrate_options parameter +- nfsserver: Use rpc-statd.service for NFS locking in EXEC_MODE=3 (bsc#955114) +- nfsserver: Add EXEC_MODE for systemd without nfs-lock.service (bsc#955114) +- IPaddr2: Add IPv6 DAD collision detection +- Filesystem: add overlay as supported filesystem +- ldirectord: dns_check and fallbackcommand enhancements +- IPaddr2: fix potential syntax error on if-then-else +- SAPDatabase: add Oracle 12 to list of supported databases (bsc#953991) +- mysql-common.sh: fix issue where "removing old PID file" wasn't logged +- mysql-common.sh: when mysql has been stopped, mysql stop returns success +- mysql.sh: wait up to startup_wait seconds before failing if mysqld startup is slow +- orainstance.sh: fix 90s wait/killing of databases containing the name of the database being killed, and added cleanup code to kill remaining listener process +- ip.sh: Use DAD to 
check for IPv6 address collision +- iSCSITarget: fix to only create one IQN and add portals to it +- galera: document the bootstrap flow +- galera: start joining nodes during 'monitor' to allow long-running SST +- galera: add support for MYSQL_HOST and MYSQL_PORT from /etc/sysconfig/clustercheck +- redis: fix password parser +- pgsql: fix exec_sql errors like "unknown variable select pg_ " in dash +- pgsql: fix get_my_location() sql regression +- docker: fix image variable name +- pgsql: Fix return code override in pgsql_real_start() +- slapd: add "maxfiles" parameter to set max number of open files (for ulimit -n) +- redis: use required client password when set +- send_arp: fix for infiniband, re-merge from upstream iputils arping +- CTDB: Preserve smb.conf permissions (bsc#935253) +- lxc: fix emergency stop functionality on 1.0 +- tomcat: use runuser instead of su for SELinux enforcing mode +- pgsql: use runuser instead of su command for SELinux enforcing mode +- docker: image name check fixes +- iSCSITarget: properly create portals for lio-t implementation +- iSCSILogicalUnit: when deleting a LUN or initiator fails with lio-t, proceed with warning +- iSCSILogicalUnit: return OCF_NOT_RUNNING on monitor if backing path does not exist +- iSCSILogicalUnit: add check for leftover target/core entries for lio-t +- pgsql: delete old replication slot when creating a new slot. 
+- Filesystem: support RozoFS +- orainstance.sh: interpret listener stop results correctly +- dhcpd: use correct default chroot for RHEL based systems +- LVM: allow vgck failures if partial_activation is true +- redis: avoid 0 byte dump.rdb start failures +- docker: fix container_exist test +- redis: fixed start operation if replication sync takes > 20 seconds +- ethmonitor: add link_status_only option for skipping RX counter and arping tests +- clvm: fix issue with only first option of daemon_options being used +- IPsrcaddr: return correct error code during stop when misconfigured +- clvm: activate_vgs option for enable/disable of automatic vg activation +- galera: properly redetect bootstrap after demote +- galera: clear last known sequence number any time promote is even attempted +- asterisk: fix return code +- galera: retrieve last sequence number without using read-only mode +- redis: add wait_last_known_master option +- redis: only connect to active master instances +- redis: do not attempt to demote if redis is dead +- redis: reliable shutdown. +- pgsql: add support for replication slots +- redis: set executable bit to be able to create docs (make rpm) +- rabbitmq-cluster: fix rmq_join_list() to only return online nodes +- rabbitmq-cluster: new RA +- Filesystem: support overlayfs +- sg_persist: use default binary setting in meta-data +- dnsupdate: use nsupdate_opts parameter +- nfsserver: merge options into existing /etc/sysconfig/nfs +- portblock: portno param can be a string like 137,138 +- portblock: replace ancient heartbeat config with crm configure +- portblock: clarify TCP RST vs ICMP port unreachable +- VirtualDomain: enforce C locale in force_stop +- redis: retry on unknown error when starting +- redis: remove stop timeout and add placeholder master during election period +- CTDB: Change default socket location to CTDB's expected default. 
+- multiple RA: make sure that the pidfile directory exist +- multiple RA: create state-directory writable by the application +- orainstance.sh: Handle ORA-* error messages +- redis: new RA + * Thu Jan 29 2015 resource-agents contributors - stable release 3.9.6 - VirtualDomain: add migrate_options parameter - VirtualDomain: enforce C locale in status - ocf-shellfuncs: add printenv to RA trace - nginx: allow different URLs for level 10 and 20 monitor * Tue Jan 20 2015 resource-agents contributors - release candidate 3.9.6 rc1 - VirtualDomain: add sync_config_on_stop to sync the config to other nodes - mysql.sh: Allow MySQL to run as user other than mysql - fs-lib.sh: Fix missed detection of write failure - iscsi: run iscsi discovery only when necessary - VirtualDomain: save the config before virsh undefine (bnc#891915) - sg_persist: new RA - ldirectord: Add Install section for systemd unit file. - kamilio: new RA - findif.sh: Use the most specific matching route - LVM: volgrpname is unique (meta-data update) - ldirectord: Get correct user for sending email (bnc#910497) - iSCSILogicalUnit: do not set write_back when creating backstore/block - portblock: Feature: reset_tcp_on_unblock_stop - nfsserver: prevent error messages on platforms without rpcuser - Xen: fix regression with xm and quoting (lf#2671) - lxc: re-add LXCpre1.0 logic - lxc: fix for missing lxc-ps command - Route: some unique attributes are not unique (meta-data) - IPaddr2 findif: accept dotted quad netmask - VirtualDomain: For Xen, prefer xl to xen-list (boo#901453) - Xen: Use xl list $domain return code in status check (boo#901453) - pgsql: PostgreSQL 9.3 compatibility for unix_socket_directories - lvm_by_vg.sh: Only strip tag on stop if we are owner - ocf-shellfuncs: set HA_LOGD depending on HA_use_logd - mysql: avoid use of check_binary in common validation function. 
- mysql: report error when validation fails during monitor yet pid is still active - docker: new RA - mysql: do not report success on 'stop' if validation fails - anything: fix output redirection - mysql: fix unexpected operation error that caused by MySQL client timeout. - Middle: anything: Prevent stop failure, even if the job takes time to stop. - doc: Add pcs to man page example section - multiple RA: exit reason support - Build: add --compat-habindir option for HA_BIN backward compatibility - iSCSITarget: monitor operation must pass before start is considered complete - iSCSILogicalUnit: monitor operation must pass before start is considered complete - iSCSILogicalUnit: Fixes the check for pre-existing LUN ACLs - iSCSITarget: fixes syntax error caused by targetcli update - Filesystem: when loading kernel modules wait for filesystem to initialize - ethmonitor: add infiniband status monitoring support - Xinetd: refine exit codes when xinetd is stopped - Xen: Properly quote domain name in call to create - Xen: Use xenstore-ls to get status if available - Xen: Replace use of xm with xl (bnc#882548) - oracle: fix setting monitor user profile for other languages (nls) - apache: Revised fix for init script reference on SUSE (bnc#884674) - VirtualDomain: Try xenstore-ls if no emulator is set (bnc#885292) - galera: new RA - VirtualDomain: Add support for qemu-dm as emulator (bnc#885292) - CTDB: add ctdb_rundir parameter and create on startup - VirtualDomain: new parameter save_config_on_stop to enable saving running VM's configuration - fs-lib.sh: Force kill processes with access to shared libraries on mount point - iSCSILogicalUnit: add targetcli support - Filesystem: Add force_unmount option - nfsserver: regenerate statd state file before starting nfs - nfsnotify: new RA - nfsserver: introducing nfs_no_notify option for suppressing reboot notifications - nfsserver: Keep statd directories synced with ha backup - oracle: reset MONUSR password if expired - oracle: try 
as sysdba if monitor as MONUSR fails - oracle: Make monitoring user configurable (bnc#850589) - oralistener.sh: Do not fail during stop if listener has already stopped - nfsserver: Add options for explicitly setting nfs daemon ports - vsftpd: new RA - fs-lib.sh: Fix usage of findmnt to work for filesystems containing bind mounts - exportfs: allow multiple exports - ldirectord: Update .spec file with systemd support (bnc#863250) - ldirectord: Add systemd unit file (bnc#863250) - nfsserver: Do not require shared info directory when cloned - nfsserver: Allow dynamically setting rpc.nfsd options. - iscsi: iscsi status fails with open-iscsi with support for flash (bnc#878039) - Med: rgmanager/fs: typo preventing passing some mount opts - nfsserver.sh: add an optional option for the rpc.statd listening port - Feature: addition of bind-mount resource agent for rgmanager - exportfs: Do not fail during monitor and stop operations if dir has trailing slash - nfsserver: Do not fail monitor if lock services are enabled - nfsserver: let systemd service files handle lock notifications - vm.sh: 'no_kill' option for preventing auto destruction of vm after timeout period - vm.sh: Monitor kvm resources without requiring libvirtd to be up - nfsserver: preserve statd directory permissions during sm-notify or else lock recovery fails - kamailio: new RA - nfsserver.sh: On stop, kill sm-notify so /var/lib/nfs/statd can be unmounted - db2: Report OCF_ERR_GENERIC instead of OCF_NOT_RUNNING when instance is not completely up - named.sh: Fix issues running named daemon as a non-root user - db2.sh: rgmanager wrapper for the heartbeat db2 agent - CTDB: do not fail monitor operation when ctdb socket does not exist - exportfs: set unlock_on_stop_default=1 (bnc#864263) - dnsupdate: new RA - clvm: new RA - mysql: handle $secs_behind = NULL - dhcpd: Added a restart-function - high: conntrackd: allow probe to return OCF_RUNNING_MASTER - VirtualDomain: check process table for qemu-system-* too, not 
just qemu-kvm - VirtualDomain: avoid running "virsh uri" if hypervisor is set - apache.sh: Add ability to set custom httpd binary - db2: Allow db2 agent to work without crm_master binary - VirtualDomain: Attempt to determine vm status even when libvirt is unavailable - VirtualDomain: Fixes parsing domain name from xml file. - fs-lib.sh: Fixes failure to unmount local fs when process runs with cwd inside fs mount - ldirectord: Fix sockaddr_in6 redefined error - Pure-FTPd: Create pid directory if needed - VirtualDomain: support more virsh domstate output formats - varnish: Added support for ulimit -l and ulimit -n - lxc: support up-to-date lxc-ps versions - tomcat: Override default tomcat config with resource options - nfsserver: nfsserver not starting due to missing etab file - tomcat: Avoid unnecessary force kill of tomcat on stop - tomcat: Avoid race condition in reading pid file on stop - slapd: find the correct default slapd config in fedora - tomcat: Detect start script location - pgsql: Support for non-standard port and library locations - fix netfs unmount/self_fence integration - Med: oracledb.sh: Remove quotes around listener name - IPaddr2/findif.sh: Do a sanity check only on start and validate-all - ldirectord: Disable HTTPS SSL certificate hostname checking - tomcat: Monitor rotatelogs process and restart when it is stopped - jboss: Monitor rotatelogs process and restart when it is stopped - mysql: Fix for the issue of detecting an unconfigured slave with empty master_host since setting empty master_host is not allowed with 5.5 - IPaddr2/findif.sh: Robust parameter checking for 'nic' - VirtualDomain: Ensure it is possible to manage a libvirt domain defined outside of VirtualDomain - VirtualDomain: Fix ability to use default libvirt hypervisor without explicitly setting agent attribute - tomcat-6.sh: Fixes setting TOMCAT_USER correctly - tomcat-s.sh: Do not fail on stop if config validation fails. 
- fs-lib.sh: Removes usage of fuser -kvm from fs-lib.sh based agents. - ldirectord: Use an alarm for LDAP check to ensure it times out - exportfs: stop with no directory should succeed - Xen: retry domain lookup in repeating monitor and stop - Fixes lvm metadata corruption caused when activating by lv using tags. - ldirector: fix using service name instead of port number (bnc#836759) - Filesystem: remove SLES10 compatibility code - Raid1: disallow md raid arrays as clone resources to avoid data corruption - apache: put back config file existence test - VirtualDomain: Do not attempt graceful shutdown if force_stop is enabled - Med: oracledb.sh: Fix process name grep in exit_idle - VirtualDomain: use virsh create instead of define to start it non-persistent - eDir88: multiple IP support - apache: better handling of not installed apache - jboss: add jboss_base_dir parameter to support multi-instances on JBoss 6 - jboss: stop the JBoss daemon by sending a signal on JBoss 6 - jboss: add run_command parameter to change the start up script - jboss: add jboss_version parameter to support JBoss 6 - tomcat: use root as the default for tomcat_user instead of RUNASIS mode - tomcat: multiple tomcat instances based on CATALINE_BASE - pound: add parameter maxfiles to set ulimit - apache: remove unnecessary and imperfect checks from validate_all (bnc#827927) - pgsql: set only one node into sync mode when using 3 nodes or higher - Med: oracledb.sh: Set RESTART_RETRIES back to 0 - Raid1: stop arrays even with block device file missing (bnc#821861) - oracle/oralsnr: use /bin/sh in sudo calls (bnc#825517) - apache: Properly check meta_timeout variable during graceful stop - Raid1: set MDADM_NO_UDEV appropriately if udev is not used - Raid1: wait for udevd to settle (bnc#821861) - iSCSILogicalUnit: add tgt specific parameters bstype, bsoflags, device_type - apache: Attempt graceful stop before -TERM signal - LVM: Warn user if initrd is older than lvm.conf when using exclusive 
activation with tags - LVM: Exclusive activation without clvmd using filtering with tags. - named: Attempt to autogen /etc/rndc.key using rndc-confgen tool - mysql: really use log setting (bnc#823095) - mysql: test properly for failed process start (bnc#823095) - Med: Don't preserve SELinux context when copying files to /var/lib/nfs/sm - Med: Cleanup oracledb.sh rgmanager agent and add support for Oracle 11g - Med: Cleanup oralistener.sh and add support for Oracle 11g - Med: Cleanup orainstance.sh and add support for Oracle 11g - fs-lib.sh: Faster filesystem start/stop through use of 'findmnt' command for 'is_mounted' function - named.sh: Addition of update-source option. - LVM: Retry exclusive activation after deactivating vg cluster-wide - LVM: Retry deactivating vg allowing udev to settle - ip.sh: Fixes usage of ipv6 addresses with uppercase lettering - LVM: Verify setup on start plus verify exclusive activation is possible. - Route: add IPv6 support - build: Place resource state information in /var/run/... by default - DRBD: remove deprecated drbd agent. - fs-lib.sh: Always honor self_fence option when force_unmount="on" - postgres-8: Shutdown postgres with SIGINT before forcing SIGQUIT - tools: send_arp.libnet: reuse ARP packets (debian#701914) - ethmonitor: correctly detect when the network is unplugged. - Raid1: do not test for device existence in the stop operation (bnc#821861) - mysql: Attempt to auto-detect mysql binary default location. 
- VirtualDomain: Support saving and restoring virtual machine snapshot state files - nfsserver: Cleanup shared nfs dir mount - nfsserver: Enable file locking daemon when systemd is being used - nfsserver: Maintain SELinux permissions on failover for nfs v3 lock state - nfsserver: Improve support for v3 file lock recovery - nfsserver: Add systemd unit-file support when init scripts are not present - VirtualDomain: Properly detect defined lxc domains * Thu Feb 7 2013 Linux-HA contributors - stable release 3.9.5 - IPaddr2: support nic:iflabel format in nic parameter - VirtualDomain: allow for custom migrateport * Wed Jan 30 2013 Linux-HA contributors - release candidate 3.9.5 rc1 - ocf-shellfuncs: RA tracing - IPaddr2: make sure that some ARP send program runs - pgsql: add check_wal_receiver parameter - pgsql: support starting as Hot Standby - nfsserver: improve rpc.statd support - nfsserver: add option -n for rpc.statd (bnc#794479) - nfsserver: make the retry time for sm-notify configurable - nfsserver: make sm-notify running in the foreground configurable - exportfs: handle '' exportfs embelishment for '*' (bnc#791690) - jboss: set JAVA_OPTS correctly - pound: use correct default for control-binary - pound: No error when stopping pound and no pound is running - Squid: support systems using IPv6 - Xinetd: do not fail in stop if the daemon is not running - Xinetd: improve finding Xinetd process (thanks to Vadym Chepkov) - SendArp: add background parameter - SendArp: fix monitor - ocft: print the actual case names when testing - ocft: make only the updated test-case file - ocft: add "incremental" mode (ocft test -i) - include a copy of LGPL license file * Tue Nov 22 2012 Linux-HA contributors - stable release 3.9.4 - IPaddr2: fix waiting the completion of IPv6 address allocation - zabbixserver: RA for zabbix servers management * Tue Nov 13 2012 Linux-HA contributors - release candidate 3.9.4 rc1 - ocf-rarun: add the RA driver - IPaddr2: use better test for infiniband 
(bnc#783353) - IPaddr2: replace the findif binary by findif.sh - IPaddr2: add IPv6 support - IPaddr2: really send arps in background if requested - IPaddr2: use send_arp instead of ipoibarping if not available - Filesystem: include ceph in the list of non-blockdev filesystems - Raid1: stop processes using raiddev - Raid1: manage multiple arrays - Raid1: discover block size for dd (bnc#781137) - exportfs: cleanup exportfs cache on stop (bnc#770210) - iscsi: don't fail on stop of the iscsi server fails - iscsi: use iscsiadm -m session -r in stop - iscsi: do discovery in start only - iscsi: check session status in monitor - iscsi: add try_recovery parameter - oracle: ignore password expiry warning in monitor (bnc#770250) - oracle: connect as a regular user in monitor - pgsql: add option recovery_end_command - Xen: repair node_ip_attribute use - SAPDatabase: improvement of cleanup of saphostctrl process - SAPInstance: monitor-master not advertised (bnc#782482) - SAPInstance: don't wait for timeout if the stop attempt failed - SAPInstance: failes on sapcontrol message Unauthorized (bnc#782486) - mysql-proxy: copy in-depth monitoring action from the mysql resource agent - mysql-proxy: add test_table, test_user and test_passwd parameters - mysql-proxy: implement "parameters" parameter - mysql-proxy: add the 'plugins' parameter - mysql-proxy: version specific checks - mysql-proxy: admin plugin auto loading - mysql-proxy: perform SELECT for OCF_CHECK_LEVEL 20 only - mysql-proxy: create pid/socket directories if needed - conntrackd: resync from other hosts in start - slapd: Gracefully handle config check during probe - tomcat: Correction of the time-out level of the stop processing. - tomcat: Correction of the process alive monitoring. 
- ldirectord: Added READDQUIESCENT parameter - sfex: fix sfex_init for 64-bit big endian platforms - SendArp: exit with the right code when not properly configured - Squid: fix getting PIDs of squid processes (lf#2653) - portblock: use end-of-word instead of space at the end of the line - named: use pgrep instead of searching in ps(1) output - named: fix monitor if named_rootdir is set to / - VIPArip: fix exit codes - VIPArip: make start idempotent - tools: add test-findif.sh - tools: add send_ua binary for IPv6 support in IPaddr2 - ocft: new test for Raid1 * Fri May 25 2012 Linux-HA contributors - stable release 3.9.3 - dhcpd: new RA to manage ISC DHCP servers - Filesystem: add nfs4 to the list of well known types - IPaddr2: fix regression introduce in d93b5fd, nic=lo always "stopped" - iSCSILogicalUnit: correctly match for target IQN and backing device name (iet and tgt) - jboss: implememnt rotating of console log - mysql: improve handling of reset slave - oracle, oralsnr: get rid of eval - slapd: pass bind_dn correctly to ldapsearch * Wed May 16 2012 Linux-HA contributors - release candidate 3.9.3 rc1 - asterisk: new resource agent - named: new RA to manage bind servers - pound: new RA for Pound HTTP/HTTPS reverse-proxy and load-balancer - rsyslog: new RA to manage rsyslog servers - slapd: new RA to manage OpenLDAP servers - varnish: new resource agent - apache: add support for IPv6 in monitor - apache: create /var/run/apache2 if it doesn't exist - apache: fix sysconfig includes & enable status for default SUSE conf - conntrackd: test for socket existence in monitor instead of process grep - conntrackd: rename parameter "conntrackd" to "binary" - CTDB: Add smb_fileid_algorithm parameter (bnc#696978) - CTDB: Improve monitor op (check output of ctdb status, bnc#712192) - CTDB: Set ctdb_start_as_disabled=no by default (bnc#712410, required by samba 3.6) - exportfs: allow expanding the fsid parameter to produce correct exportfs options - exportfs: don't grow 
/var/lib/nfs/rmtab indefinitely - exportfs: fix monitor action for special characters and common suffixes - Filesystem: add support for glusterfs (lf#2620) - Filesystem: add tmpfs to the list of supported filesystems - Filesystem: allow to force cloning for local mounts - Filesystem: don't use direct dd option in monitor depth 20 for non-blockdevice fs - Filesystem: fix determining if the device is a block device - Filesystem: improve read/write checks for CHECK_LEVEL 10, 20 - Filesystem: repair the fast_stop parameter use (its value was always false) - Filesystem: support ceph - Filesystem: remove a status file only when OCF_CHECK_LEVEL is set to 20 - IPaddr: add back the local_start/stop_script code - IPaddr: remove colon at the end of the interface name - IPv6addr: always use the provided nic and cidr_netmask when specified - IPv6addr: handle a link-local address properly in send_ua - iscsi: do not rely on iscsid.startup being set correctly (bnc#751783) - iscsi: proceed if iscsid is not running if iscsid.startup is present in iscsid.conf - iSCSILogicalUnit: fix default for scsi_sn - iSCSITarget: treat an empty "implementation" parameter specially - jboss: add the java_opts parameter for java options - ldirectord: precedence error with perl v5.8.8 in IPv6 code - LVM: drop vgck(8) from monitor - LVM: force dmevent monitoring for clones - LVM: use ls instead of vgdisplay in status - lxc: fix LXC_status to work with lxc-0.7.5 or later - mysql: improve replication support - mysql: check mysql status more thoroughly before stopping - mysql: fix validation return codes - mysql: support 5.5 slave status message format - nfsserver: Support of multiple IP addresses (bnc#684143) - nfsserver: don't run sm-notify in foreground (bnc#759616) - ocf-shellfuncs: fix loglevel variable scope in ha_log - ocft: new tests for named, IPv6addr, oracle, Xinetd - ocft: several improvements - oracle: improve managing IPC objects - oracle: improve matching instance specific files and 
processes - pgsql: support for replication - postfix: multiple fixes - Raid1: support for multiple MD arrays, as specified in raidconf - SAPDatabase: add support for Sybase ASE and SAP HANA database - SAPDatabase: correcting the unique values of RAs parameters - SAPDatabase: replace method for checking responsiveness of saphostexec - SAPDatabase: version 2.00 make use of saphostagent - SAPInstance : correcting the unique values of RAs parameters - slapd: always set the exit code correctly in monitor - tomcat: remove pidfile before start, it may prevent some tomcat releases from starting - VirtualDomain: add a functionality that modifies utilization of resource automatically - VirtualDomain: if the configuration file is missing on stop exit with success - VirtualDomain: honor virsh "in shutdown" state - Xen: add support for HVM ACPI graceful shutdown - Xen: wait in migrate_from for the migration to finish instead of bailing out immediately - Tools: findif: Use most specific matching route (bnc#740738) - Tools: send_arp.libnet: fix for big endian platforms (bnc#721334) - doc: add the RA developer's guide * Wed Jun 29 2011 Dejan Muhamedagic and others - stable release 3.9.2 - ethermon: new resource agent - iscsi: fix regression in 3.9.1 for open-iscsi version 2.0-872 (lf#2562) - pgsql: fix regression in 3.9.1 in directories on probes - VirtualDomain: if there's no config exit on stop with success - doc: add sfex_init(8) man page * Wed Jun 15 2011 Dejan Muhamedagic and others - stable release 3.9.1 - ocf-tester: tolerate OCF_ERR_INSTALLED on probes and missing binaries - pgsql: improve configuration check and probe handling * Wed Jun 01 2011 Dejan Muhamedagic and others - release candidate 3.9.1 rc1 - first release since establishing joined repository with RHCS agents - build: new spec file and autoconf to support both agents' sets - build: use ./configure --with-ras-set=linux-ha to configure for heartbeat RA set - build: create compatibility symlinks in autofoo not in 
spec - build: GNUmakefile removed - lxc: new RA to manage lxc linux containers - symlink: new RA to manage symbolic links - db2: new implementation with master/slave mode - oracle: improve oracle process list test (bnc#673027) - exportfs: backup and restore rmtab to ensure smooth client failover on node failures - CTDB: Allow stop to succeed when using pkill on ctdbd (bnc#695829) - mysql: --skip-slave-start option is default now - mysql: set connect timeout to 10 seconds rather than 1 second - mysql: keep replication state (prevents data loss on master reset) - mysql: don't rely on state information from pacemaker, but check if the instance is in the read-only mode - mysql: if test parameters are all set, assume OCF_CHECK_LEVEL=10 - mysql: support for master/slave for more than two nodes - mysql: don't wait for replication to finish, when not replicating - mysql: store replication state in separate attributes for each master - VirtualDomain: correctly create migration URI when target is an FQDN - VirtualDomain: properly wait until domain_name is non-empty - ldirectord: add a support of "netmask" directive for IPv6 - ldirectord: fix fwmark behavior for IPv6 - ldirectord: ignore children in Net::DNS - iscsi: add support for open-iscsi version 2.0-872 (lf#2562) - postfix: issue error if 'postfix abort' failed - postfix: improve exit codes on installation problems - postfix: use monitor to test if postfix works after the start action - ocft: fix make command for compatibility with mawk/Debian (lf#2600) - ocft: test case for pgsql - ocft: test case for postfix - ocft: test case for iscsi - doc: improve man pages output - doc: add examples for master/slave resource agents * Wed Feb 16 2011 Dejan Muhamedagic and others - stable release 1.0.4 - ocft: testcases for db2, LVM, and Filesystem * Fri Feb 11 2011 Dejan Muhamedagic and others - release candidate 1.0.4 - add GPLv3 license file (bnc#655700) - ocf-shellfuncs: allow ocf_run to return the actual exit code - 
ocf-shellfuncs: handle properly syslog facility set to none (bnc#621818) - ocf-shellfuncs: correctly identify root by id only (bnc#602312) - RA: add OCF_ROOT/lib/heartbeat directory (development) - RA: set the HA_RSCTMP directory to /var/run/resource-agents (lf#2378) - build: install jboss - conntrackd: new RA - exportfs: new RA - nginx: new RA - fio: new RA for IO load simulation - Filesystem: allow cloning of some filesystems as read-only (lf#2440) - Filesystem: add fast_stop parameter (lf#2402) - Filesystem: Clarify metadata and improve non-clone warning - Filesystem: new run_fsck parameter - LVM: add partial_activation parameter (lf#2490) - IPaddr2: fix reference to Infiniband arping binary (bnc#668447) - IPaddr2: optionally flush kernel routing table on interface stop - IPaddr2: exit with the right code when not properly configured - IPaddr2: exit early and with the right code if the ip parameter is not set - IPaddr2: unique_clone_address should work without CIP (lf#2442) - IPaddr: return the correct code if interface delete failed - IPv6addr: allow link-local addresses in case the interface name is provided - IPv6addr: interface index in /proc/net/if_inet6 may be longer than 2 chars (lf#2462) - IPsrcaddr: exit with the right code when not properly configured - IPsrcaddr: add the cidr_netmask parameter - Tools: findif: differentiate between error conditions - nfsserver: fix the default string for the notification parameter - nfsserver: don't use -v in the notify cmd with rpc.statd - iSCSITarget: fix race for target IDs when using IET (lf#2432) - iSCSITarget: follow changed IET access policy - Raid1: Support attempting to re-add mirrors on deep monitor action (bnc#619121) - Raid1: Fix graceful stop code path - Raid1: Handle stop for failed arrays properly (bnc#618775) - sfex: output log messages also to stderr in sfex_init - sfex: add the sfex_stat command - sfex: wait in the start and stop actions until sfex_daemon starts/exits - Xen: implement stop of a 
migrating domain (bnc#656227) - Xen: check the allow_mem_management boolean properly (bnc#637525) - Xen: Always run destroy in stop sequence. - Xen: use xen-list command for status check if available (bnc#628735) - Xen: use xen-destroy for stop, if available. - Xen: Allow node configurable attribute to specify which IP to use for live migration (bnc#628735) - VirtualDomain: fix spurious stop failures - VirtualDomain: don't timeout in stop before escalating to "forced stop" - ManageVE: add migration capability - MailTo: don't check if user exists for email address (might be an alias or remote) - CTDB: Remove hard-coded timeout on start op - CTDB: Don't manage Samba and Winbind by default - CTDB: Deprecate (and make optional) smb_private_dir param (bnc#623788) - tomcat: Ensure name of tomcat resource is only used on start operation and expose JAVA_OPTS variable for use - tomcat: Fix to ensure default OCF_RESKEY_xx values are observed - tomcat: Add CATALINA_BASE parameter, defaults to CATALINA_HOME, permits multiple tomcat instances - tomcat: Use Tomcat stop TIMEOUT -force to improve stop - Dummy: migrate_from/to: correct OCF_RESKEY_CRM_meta_migrate_xxx variable names - Dummy: make method reload work - anything: add the workdir parameter - mysql: clone and master-slave functionality - mysql: add replication monitoring - mysql: check for write permissions after creating pid and socket directory - mysql: make client binary path configurable - pgsql: cd to pgdata before running commands (fixes permission error) - pgsql: add optional username, password, and sqlcode parameters for monitor - pgsql: add new "config" parameter - pgsql: properly implement pghost parameter - pgsql: socketdir parameter to manage non-default UNIX socket directories - oracle: reduce output from sqlplus to the last line for queries (bnc#567815) - db2: Replace call to db2_local_ps with db2nps - db2: guard against a hanging db2stop by spawning this into the background. 
Use db2_kill after grace period. - db2: add multi partition support - db2: improve behaviour on probes - db2: support for v9.x instances (bnc#608952) - SAPDatabase,SAPInstance: improve LD_LIBRARY_PATH processing (bnc#640026) - SAPInstance: prevent premature expansion of [:upper:] [:lower:] when producing sidadm uid - SAPInstance: Moved testing of SAP profile directory and START profile to a later stage (only when needed), for more robustness - SAPInstance: fix return codes in probes - SAPInstance: New parameter: SHUTDOWN_METHOD - SAPInstance: ensure enqueue failover in monitor_clone on process failure - SAPInstance: don't rely on op target rc when monitoring clones (lf#2371) - SAPDatabase: prevent premature expansion of [:upper:] and [:lower:] when producing sidadm/orasid/db2sid uids - SAPdatabase: Changed Oracle recovery method from "recover automatic database" to "end backup" - SAPDatabase: Adapt process search pattern for DB/2 9.5 - SAPDatabase: start listener only if database processes are found - SAPDatabase: avoid continuous output to syslog in monitor with SAP 7.20 and J2EE_ONLY=1 - ldirectord: http: connect to server instead of protocol (Debian#594958) - ldirectord: add implicit support for submission RFC4409 - ldirectord: example configuration for a submission virtual service - ldirectord: Shutdown write-side of client connection after writing has finished - ldirectord: port number mismatch of imaps and pops - ldirectord: Oracle compatibility - ldirectord: don't exit on timeout in HTTP/HTTPS check - ldirectord: allow underscore in service name - ldirectord: use $1 instead of \1 in pattern replace (bnc#605086) - Tools: ocf-tester: Extend to cover initial probe (monitor_0) test. - Tools: ocf-tester: set and export some common meta variables (lf#2524) - Tools: ocf-tester: meta-data also should never be affected by missing binaries. 
- Tools: ocf-tester: show output from the agent in case of error * Tue Apr 13 2010 Dejan Muhamedagic and others - stable release 1.0.3 - meta-data: improve timeouts in most resource agents (reduce the number of warnings by the shell) - RA: log messages to stderr if attached to a terminal - ocf-shellfuncs: tests to check for clone/ms resources - ocf-shellfuncs: don't output to stderr if using syslog (prevents double logging from the RA and lrmd) - make sure that OCF_RESKEY_CRM_meta_interval is always defined (lf#2284) - ocft: new RA test suite - VirtualDomain: bail out early if config file can't be read during probe (bnc#593988) - VirtualDomain: spin on define until we definitely have a domain name - VirtualDomain: fix incorrect use of __OCF_ACTION (the stop operation may timeout otherwise) - Filesystem: prefer /proc/mounts to /etc/mtab for non-bind mounts (lf#2388) - IPaddr2: don't bring the interface down on stop (otherwise IPv6 addresses may be removed) - oracle/oralsnr: improve exit codes if the environment isn't valid - oracle/oralsnr: improve logging - Route: don't assume that OCF_RESKEY_CRM_meta_clone_node_max is set to a number (lf#2375) - Route: add route table parameter (lf#2335) - sfex: don't use pid file (lf#2363,bnc#585416) - SFEX daemon: fix logging - ldirectord: fix the configfile default (bnc#589457) - drbd: fix metadata (bnc#588684) - IPsrcaddr: modify the interface route (lf#2367) - ldirectord: Allow multiple email addresses (lf#2168) - vmware: fix set_environment() invocation (lf#2342) - vmware: updated to version 0.2 - apache: return the right exit code from monitor (bnc#578628) - iSCSILogicalUnit: fix monitor for STGT * Mon Feb 01 2010 Dejan Muhamedagic and others - stable release 1.0.2 - EvmsSCC, Evmsd, LinuxSCSI, drbd, pingd: marked as deprecated (lf#2244) - CTDB: new resource agent for clustered samba - postfix: new resource agent - proftpd: new resource agent - AoEtarget: new resource agent to export ATA-over-Ethernet (AoE) targets - Squid:
new resource agent - VirtualDomain: new resource agent (manage virtual domains using libvirt/virsh) - anything: new resource agent for arbitrary daemons - mysql-proxy: new resource agent - iSCSITarget/iSCSILogicalUnit: two new resource agents - portblock: fast reconnect/tickle ACK (new feature) - IPv6addr: new nic and cidr_netmask parameters - mysql-proxy: log_level and keepalive parameters - Filesystem: implement deep monitor operation - apache: monitor operation of depth 10 for web applications (lf#2234) - SAPDatabase + SAPInstance: New versions from SAP - CTDB: auto-generate cluster-specific part of smb.conf (lf#2308) - ClusterMon: don't fail in stop if the process is missing (bnc#569957) - Filesystem: allow configuring smbfs mounts as clones - IPaddr2: CLUSTERIP/iptables rule not always inserted on failed monitor (lf#2281) - IPaddr2: behave if the interface is down (lf#2147) - IPaddr2: check binaries when it makes sense - IPaddr2: fix invalid default value for OCF_RESKEY_clusterip_hash (bnc#553753) - IPaddr2: include netmask in search for the right interface - IPaddr2: remove all colons from the mac address before passing it to send_arp (lf#2165) - IPsrcaddr: replace 0/0 with proper ip prefix - IPv6addr: recognize network masks properly - IPv6addr: supply checksum for ICMPv6 packets - IPv6addr: ifdef out the ip offset hack for libnet v1.1.4 (lf#2034) - IPv6addr: supply checksum for ICMPv6 packets - LVM: Make monitor operation quiet in logs (bnc#546353) - MailTo: Provide a default for MAILCMD (bnc#534803, bnc#556366) - MailTo: allow multiple word subject line - Raid1: improve monitor function (bnc#546551) - Route: improve validate (lf#2232) - Squid: make the regexp match more precisely output of netstat - VIParip: Pathname needed to be configurable (lf#1331) - VirtualDomain: avoid needlessly invoking "virsh define" - VirtualDomain: destroy domain shortly before timeout expiry - VirtualDomain: fix forceful stop (lf#2283) - VirtualDomain: loop on status if 
libvirtd is unreachable - Xen: Remove instance_attribute "allow_migrate" (bnc#539968) - apache: make sure that proxies are not used for monitor - iSCSILogicalUnit: add support for SCSI ID, SCSI SN, Vendor ID, and Product ID - iSCSILogicalUnit: add support for per-LU parameters - iSCSILogicalUnit: set default for SCSI SN, truncate SCSI ID default to 24 bytes - iSCSILogicalUnit: use a 16-byte default SCSI ID - iSCSITarget, iSCSILogicalUnit: add support for tgt - iSCSITarget: reintroduce "tid" parameter - iSCSITarget, iSCSILogicalUnit: identify targets by IQN, not by tid - iSCSITarget, iSCSILogicalUnit: support LIO - iSCSITarget: add support for CHAP authentication - iSCSITarget: add support for restricting target access - iSCSITarget: be more persistent deleting targets on stop - include ldirectord (formerly known as heartbeat-ldirectord) - iscsi: replace wrong variable reference (bnc#499291) - jboss: Added JBoss support - ldirectord: fix setting defaults for configfile and ldirectord (lf#2328) - ldirectord: fix various bugs in OCF RA (lf#1949) - mysql: escalate stop to KILL if regular shutdown doesn't work - mysql: handle monitor and stop properly on invalid environment - nfsserver: use default values (lf#2321) - nfsserver: validate should not check if nfs_shared_infodir exists (lf#2219) - nfsserver: use check_binary properly in validate (lf#2211) - nfsserver: exit properly in nfsserver_validate (lf#2173) - oracle/oralsnr: export variables properly - oracle: drop spurious output from sqlplus - pgsql: remove the previous backup_label if it exists - portblock: add per-IP filtering capability - portblock: fix invalid exit codes on monitor - postfix: fix double stop - scsi2reservation: fix wrong logic in check for scsi_reserve - vmware: make meta-data work and several cleanups (lf#2212) - shellfuncs: make the mktemp wrappers work - ocf-shellfuncs: add mercurial repository version information - ocf-shellfuncs: add ocf_is_probe function - doc: add resource agents' man 
pages including examples * Thu Oct 23 2008 Lars Marowsky-Bree and MANY others - beta release 2.99.2 - LVM: stop correctly in case vol group does not exist * Tue Sep 23 2008 Lars Marowsky-Bree and MANY others - beta release 2.99.1 * Tue Aug 19 2008 Andrew Beekhof and MANY others - beta release 2.99.0 diff --git a/ci/build.sh b/ci/build.sh index 798bd39ee..4c26ab9ce 100755 --- a/ci/build.sh +++ b/ci/build.sh @@ -1,46 +1,74 @@ #!/usr/bin/env bash -set -eo pipefail +set -o pipefail [[ "${DEBUG:-}" ]] && set -x declare -i failed failed=0 +# SC2046: Quote this to prevent word splitting. +# SC1090: Can't follow non-constant source. Use a directive to specify location. +# SC2039: In POSIX sh, 'local' is undefined. +# SC2086: Double quote to prevent globbing and word splitting. +# SC2154: var is referenced but not assigned. +ignored_errors="SC1090,SC2039,SC2154" + success() { - printf "\r\033[2K [ \033[00;32mOK\033[0m ] Checking %s...\n" "$1" + printf "\r\033[2K [ \033[00;32mOK\033[0m ] Checking %s...\n" "$1" +} + +warn() { + printf "\r\033[2K [\033[0;33mWARNING\033[0m] Checking %s...\n" "$1" } fail() { printf "\r\033[2K [\033[0;31mFAIL\033[0m] Checking %s...\n" "$1" failed=1 } check() { - local script="$1" - shellcheck "$script" || fail "$script" - success "$script" + local script="$1" + + out="$(shellcheck -s sh -f gcc -x -e "$ignored_errors" "$script" 2>&1)" + rc=$? + if [ $rc -eq 0 ]; then + success "$script" + elif echo "$out" | grep -i 'error' >/dev/null; then + fail "$script" + else + warn "$script" + fi + echo "$out" } find_prunes() { - local prunes="! -path './.git/*'" - if [ -f .gitmodules ]; then - while read module; do - prunes="$prunes ! -path './$module/*'" - done < <(grep path .gitmodules | awk '{print $3}') - fi - echo "$prunes" + local prunes="! -path './.git/*'" + if [ -f .gitmodules ]; then + while read -r module; do + prunes="$prunes ! 
-path './$module/*'" + done < <(grep path .gitmodules | awk '{print $3}') + fi + echo "$prunes" } find_cmd() { - echo "find . -type f -and \( -perm +111 -or -name '*.sh' \) $(find_prunes)" + echo "find heartbeat -type f -and \( -perm /111 -or -name '*.sh' \) $(find_prunes)" } check_all_executables() { - echo "Checking executables and .sh files..." - eval "$(find_cmd)" | while read script; do - head=$(head -n1 "$script") - check "$script" - done - exit $failed + echo "Checking executables and .sh files..." + while read -r script; do + head=$(head -n1 "$script") + [[ "$head" =~ .*ruby.* ]] && continue + [[ "$head" =~ .*zsh.* ]] && continue + [[ "$head" =~ ^#compdef.* ]] && continue + [[ "$head" =~ ^.*\.c ]] && continue + [[ "$head" =~ ^ldirectord.in ]] && continue + check "$script" + done < <(eval "$(find_cmd)") + exit $failed } +./autogen.sh +./configure +make check_all_executables diff --git a/ci/install.sh b/ci/install.sh deleted file mode 100755 index c66b56c59..000000000 --- a/ci/install.sh +++ /dev/null @@ -1,10 +0,0 @@ -#!/bin/bash -set -eo pipefail - -main() { - local filename="shellcheck_0.3.7-1_amd64.deb" - wget "http://ftp.debian.org/debian/pool/main/s/shellcheck/$filename" - sudo dpkg -i "$filename" -} - -main diff --git a/doc/dev-guides/ra-dev-guide.txt b/doc/dev-guides/ra-dev-guide.asc similarity index 100% rename from doc/dev-guides/ra-dev-guide.txt rename to doc/dev-guides/ra-dev-guide.asc diff --git a/doc/man/Makefile.am b/doc/man/Makefile.am index 5e2889513..43a3f70c8 100644 --- a/doc/man/Makefile.am +++ b/doc/man/Makefile.am @@ -1,174 +1,176 @@ # # doc: Linux-HA resource agents # # Copyright (C) 2009 Florian Haas # # This program is free software; you can redistribute it and/or # modify it under the terms of the GNU General Public License # as published by the Free Software Foundation; either version 2 # of the License, or (at your option) any later version. 
# # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program; if not, write to the Free Software # Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. # MAINTAINERCLEANFILES = Makefile.in EXTRA_DIST = $(doc_DATA) $(REFENTRY_STYLESHEET) \ mkappendix.sh ralist.sh CLEANFILES = $(man_MANS) $(xmlfiles) metadata-*.xml STYLESHEET_PREFIX ?= http://docbook.sourceforge.net/release/xsl/current MANPAGES_STYLESHEET ?= $(STYLESHEET_PREFIX)/manpages/docbook.xsl HTML_STYLESHEET ?= $(STYLESHEET_PREFIX)/xhtml/docbook.xsl FO_STYLESHEET ?= $(STYLESHEET_PREFIX)/fo/docbook.xsl REFENTRY_STYLESHEET ?= ra2refentry.xsl XSLTPROC_OPTIONS ?= --xinclude XSLTPROC_MANPAGES_OPTIONS ?= $(XSLTPROC_OPTIONS) XSLTPROC_HTML_OPTIONS ?= $(XSLTPROC_OPTIONS) XSLTPROC_FO_OPTIONS ?= $(XSLTPROC_OPTIONS) radir = $(top_srcdir)/heartbeat # OCF_ROOT=. is necessary due to a sanity check in ocf-shellfuncs # (which tests whether $OCF_ROOT points to a directory) metadata-%.xml: $(radir)/% OCF_ROOT=. OCF_FUNCTIONS_DIR=$(radir) $< meta-data > $@ metadata-IPv6addr.xml: ../../heartbeat/IPv6addr OCF_ROOT=. OCF_FUNCTIONS_DIR=$(radir) $< meta-data > $@ # Please note: we can't name the man pages # ocf:heartbeat:. Believe me, I've tried. It looks like it # works, but then it doesn't. While make can deal correctly with # colons in target names (when properly escaped), it royally messes up # when it deals with _dependencies_ that contain colons. See Bug # 12126 on savannah.gnu.org. But, maybe it gets fixed soon, it was # first reported in 1995 and added to Savannah in 2005...
if BUILD_DOC man_MANS = ocf_heartbeat_AoEtarget.7 \ ocf_heartbeat_AudibleAlarm.7 \ ocf_heartbeat_ClusterMon.7 \ ocf_heartbeat_CTDB.7 \ ocf_heartbeat_Delay.7 \ ocf_heartbeat_Dummy.7 \ ocf_heartbeat_EvmsSCC.7 \ ocf_heartbeat_Evmsd.7 \ ocf_heartbeat_Filesystem.7 \ ocf_heartbeat_ICP.7 \ ocf_heartbeat_IPaddr.7 \ ocf_heartbeat_IPaddr2.7 \ ocf_heartbeat_IPsrcaddr.7 \ ocf_heartbeat_LVM.7 \ ocf_heartbeat_LinuxSCSI.7 \ ocf_heartbeat_MailTo.7 \ ocf_heartbeat_ManageRAID.7 \ ocf_heartbeat_ManageVE.7 \ ocf_heartbeat_Pure-FTPd.7 \ ocf_heartbeat_Raid1.7 \ ocf_heartbeat_Route.7 \ ocf_heartbeat_SAPDatabase.7 \ ocf_heartbeat_SAPInstance.7 \ ocf_heartbeat_SendArp.7 \ ocf_heartbeat_ServeRAID.7 \ ocf_heartbeat_SphinxSearchDaemon.7 \ ocf_heartbeat_Squid.7 \ ocf_heartbeat_Stateful.7 \ ocf_heartbeat_SysInfo.7 \ ocf_heartbeat_VIPArip.7 \ ocf_heartbeat_VirtualDomain.7 \ ocf_heartbeat_WAS.7 \ ocf_heartbeat_WAS6.7 \ ocf_heartbeat_WinPopup.7 \ ocf_heartbeat_Xen.7 \ ocf_heartbeat_Xinetd.7 \ ocf_heartbeat_anything.7 \ ocf_heartbeat_apache.7 \ ocf_heartbeat_asterisk.7 \ ocf_heartbeat_clvm.7 \ ocf_heartbeat_conntrackd.7 \ ocf_heartbeat_db2.7 \ ocf_heartbeat_dhcpd.7 \ ocf_heartbeat_docker.7 \ ocf_heartbeat_dnsupdate.7 \ ocf_heartbeat_eDir88.7 \ ocf_heartbeat_ethmonitor.7 \ ocf_heartbeat_exportfs.7 \ ocf_heartbeat_fio.7 \ ocf_heartbeat_galera.7 \ + ocf_heartbeat_garbd.7 \ ocf_heartbeat_iSCSILogicalUnit.7 \ ocf_heartbeat_iSCSITarget.7 \ ocf_heartbeat_iface-bridge.7 \ ocf_heartbeat_iface-vlan.7 \ ocf_heartbeat_ids.7 \ ocf_heartbeat_iscsi.7 \ ocf_heartbeat_jboss.7 \ ocf_heartbeat_kamailio.7 \ ocf_heartbeat_lxc.7 \ ocf_heartbeat_mysql.7 \ ocf_heartbeat_mysql-proxy.7 \ + ocf_heartbeat_nagios.7 \ ocf_heartbeat_named.7 \ ocf_heartbeat_nfsnotify.7 \ ocf_heartbeat_nfsserver.7 \ ocf_heartbeat_nginx.7 \ ocf_heartbeat_oracle.7 \ ocf_heartbeat_oralsnr.7 \ ocf_heartbeat_pgsql.7 \ ocf_heartbeat_pingd.7 \ ocf_heartbeat_portblock.7 \ ocf_heartbeat_postfix.7 \ ocf_heartbeat_pound.7 \ ocf_heartbeat_proftpd.7 \ 
ocf_heartbeat_rabbitmq-cluster.7 \ ocf_heartbeat_redis.7 \ ocf_heartbeat_rsyncd.7 \ ocf_heartbeat_rsyslog.7 \ ocf_heartbeat_scsi2reservation.7 \ ocf_heartbeat_sfex.7 \ ocf_heartbeat_slapd.7 \ ocf_heartbeat_sg_persist.7 \ ocf_heartbeat_symlink.7 \ ocf_heartbeat_syslog-ng.7 \ ocf_heartbeat_tomcat.7 \ ocf_heartbeat_varnish.7 \ ocf_heartbeat_vmware.7 \ ocf_heartbeat_zabbixserver.7 if USE_IPV6ADDR_AGENT man_MANS += ocf_heartbeat_IPv6addr.7 endif xmlfiles = $(man_MANS:.7=.xml) %.1 %.5 %.7 %.8: %.xml $(XSLTPROC) \ $(XSLTPROC_MANPAGES_OPTIONS) \ $(MANPAGES_STYLESHEET) $< ocf_heartbeat_%.xml: metadata-%.xml $(srcdir)/$(REFENTRY_STYLESHEET) $(XSLTPROC) --novalid \ --stringparam package $(PACKAGE_NAME) \ --stringparam version $(VERSION) \ --output $@ \ $(srcdir)/$(REFENTRY_STYLESHEET) $< ocf_resource_agents.xml: $(xmlfiles) mkappendix.sh ./mkappendix.sh $(xmlfiles) > $@ %.html: %.xml $(XSLTPROC) \ $(XSLTPROC_HTML_OPTIONS) \ --output $@ \ $(HTML_STYLESHEET) $< xml: ocf_resource_agents.xml endif diff --git a/doc/man/mkappendix.sh b/doc/man/mkappendix.sh index 8f8a6220c..8f3ed3d27 100755 --- a/doc/man/mkappendix.sh +++ b/doc/man/mkappendix.sh @@ -1,18 +1,18 @@ #!/bin/sh cat < Resource agent manual pages EOF -for manpage in `printf "%s\n" $@ | sort -f`; do +for manpage in $(printf "%s\n" "$@" | sort -f); do cat < EOF done cat < EOF diff --git a/doc/man/ralist.sh b/doc/man/ralist.sh index ef8f528a6..31444b6e2 100755 --- a/doc/man/ralist.sh +++ b/doc/man/ralist.sh @@ -1,9 +1,9 @@ #!/bin/sh RADIR=$1 PREFIX=$2 SUFFIX=$3 -for f in `find $RADIR -type f -executable`; do - echo ${PREFIX}`basename $f`${SUFFIX} +find "$RADIR" -type f -executable | while read -r file; do + echo "${PREFIX}$(basename "$file")${SUFFIX}" done diff --git a/heartbeat/Delay b/heartbeat/Delay index 9cfa939d6..f9d303bf8 100755 --- a/heartbeat/Delay +++ b/heartbeat/Delay @@ -1,223 +1,223 @@ #!/bin/sh # # # Support: linux-ha@lists.linux-ha.org # License: GNU General Public License (GPL) # # This script is a test 
resource for introducing delay. # # usage: $0 {start|stop|status|monitor|meta-data} # # OCF parameters are as below: # OCF_RESKEY_startdelay # OCF_RESKEY_stopdelay # OCF_RESKEY_mondelay # # -# OCF_RESKEY_startdelay defaults to 30 (seconds) +# OCF_RESKEY_startdelay defaults to 20 (seconds) # OCF_RESKEY_stopdelay defaults to $OCF_RESKEY_startdelay # OCF_RESKEY_mondelay defaults to $OCF_RESKEY_startdelay # # # This is really a test resource script. # ####################################################################### # Initialization: : ${OCF_FUNCTIONS_DIR=${OCF_ROOT}/lib/heartbeat} . ${OCF_FUNCTIONS_DIR}/ocf-shellfuncs ####################################################################### usage() { cat <<-! usage: $0 {start|stop|status|monitor|meta-data|validate-all} ! } meta_data() { cat < 1.0 This script is a test resource for introducing delay. Waits for a defined timespan How long in seconds to delay on start operation. Start delay - + How long in seconds to delay on stop operation. Defaults to "startdelay" if unspecified. Stop delay How long in seconds to delay on monitor operation. Defaults to "startdelay" if unspecified. Monitor delay END } Delay_stat() { ha_pseudo_resource Delay_${OCF_RESOURCE_INSTANCE} monitor } Delay_Status() { if Delay_stat then ocf_log info "Delay is running OK" return $OCF_SUCCESS else ocf_log info "Delay is stopped" return $OCF_NOT_RUNNING fi } Delay_Monitor() { Delay_Validate_All -q sleep $OCF_RESKEY_mondelay Delay_Status } Delay_Start() { if Delay_stat then ocf_log info "Delay already running." return $OCF_SUCCESS else Delay_Validate_All -q ha_pseudo_resource Delay_${OCF_RESOURCE_INSTANCE} start rc=$? sleep $OCF_RESKEY_startdelay if [ $rc -ne 0 ] then return $OCF_ERR_PERM fi return $OCF_SUCCESS fi } Delay_Stop() { if Delay_stat then Delay_Validate_All -q ha_pseudo_resource Delay_${OCF_RESOURCE_INSTANCE} stop rc=$? 
sleep $OCF_RESKEY_stopdelay if [ $rc -ne 0 ] then return $OCF_ERR_PERM fi return $OCF_SUCCESS else ocf_log info "Delay already stopped." return $OCF_SUCCESS fi } # Check if all the arguments are valid numbers, a string is considered valid if: # 1. It does not contain any character but digits and period "."; # 2. The period "." does not occur more than once Are_Valid_Numbers() { for i in "$@"; do echo $i |grep -v [^0-9.] |grep -q -v [.].*[.] if test $? -ne 0; then return $OCF_ERR_ARGS fi done return $OCF_SUCCESS } Delay_Validate_All() { # Be quiet when specified -q option _and_ validation succeeded getopts "q" option if test $option = "q"; then quiet=yes else quiet=no fi shift $(($OPTIND -1)) if Are_Valid_Numbers $OCF_RESKEY_startdelay $OCF_RESKEY_stopdelay \ $OCF_RESKEY_mondelay; then if test $quiet = "no"; then echo "Validate OK" fi # _Return_ on validation success return $OCF_SUCCESS else ocf_exit_reason "Some of the instance parameters are invalid" # _Exit_ on validation failure exit $OCF_ERR_ARGS fi } if [ $# -ne 1 ]; then usage exit $OCF_ERR_ARGS fi -: ${OCF_RESKEY_startdelay=30} +: ${OCF_RESKEY_startdelay=20} : ${OCF_RESKEY_stopdelay=$OCF_RESKEY_startdelay} : ${OCF_RESKEY_mondelay=$OCF_RESKEY_startdelay} case $1 in meta-data) meta_data exit $OCF_SUCCESS ;; start) Delay_Start ;; stop) Delay_Stop ;; monitor) Delay_Monitor ;; status) Delay_Status ;; validate-all) Delay_Validate_All ;; usage) usage exit $OCF_SUCCESS ;; *) usage exit $OCF_ERR_ARGS ;; esac exit $? diff --git a/heartbeat/Filesystem b/heartbeat/Filesystem index e59414896..96673f90d 100755 --- a/heartbeat/Filesystem +++ b/heartbeat/Filesystem @@ -1,883 +1,883 @@ #!/bin/sh # # Support: linux-ha@lists.linux-ha.org # License: GNU General Public License (GPL) # # Filesystem # Description: Manages a Filesystem on a shared storage medium. # Original Author: Eric Z.
Ayers (eric.ayers@compgen.com) # Original Release: 25 Oct 2000 # # usage: ./Filesystem {start|stop|status|monitor|validate-all|meta-data} # # OCF parameters are as below: # OCF_RESKEY_device # OCF_RESKEY_directory # OCF_RESKEY_fstype # OCF_RESKEY_options # OCF_RESKEY_statusfile_prefix # OCF_RESKEY_run_fsck # OCF_RESKEY_fast_stop # OCF_RESKEY_force_clones # #OCF_RESKEY_device : name of block device for the filesystem. e.g. /dev/sda1, /dev/md0 # Or a -U or -L option for mount, or an NFS mount specification #OCF_RESKEY_directory : the mount point for the filesystem #OCF_RESKEY_fstype : optional name of the filesystem type. e.g. ext2 #OCF_RESKEY_options : options to be given to the mount command via -o #OCF_RESKEY_statusfile_prefix : the prefix used for a status file for monitoring #OCF_RESKEY_run_fsck : fsck execution mode: auto(default)/force/no #OCF_RESKEY_fast_stop : fast stop: yes(default)/no #OCF_RESKEY_force_clones : allow running the resource as clone. e.g. local xfs mounts # for each brick in a glusterfs setup # # # This assumes you want to manage a filesystem on a shared (SCSI) bus, # on a replicated device (such as DRBD), or a network filesystem (such # as NFS or Samba). # # Do not put this filesystem in /etc/fstab. This script manages all of # that for you. # # NOTE: If 2 or more nodes mount the same file system read-write, and # that file system is not designed for that specific purpose # (such as GFS or OCFS2), and is not a network file system like # NFS or Samba, then the filesystem is going to become # corrupted. # # As a result, you should use this together with the stonith # option and redundant, independent communications paths. # # If you don't do this, don't blame us when you scramble your # disk. ####################################################################### # Initialization: : ${OCF_FUNCTIONS_DIR=${OCF_ROOT}/lib/heartbeat} . 
${OCF_FUNCTIONS_DIR}/ocf-shellfuncs # Defaults DFLT_STATUSDIR=".Filesystem_status/" # Variables used by multiple methods HOSTOS=`uname` # The status file is going to an extra directory, by default # prefix=${OCF_RESKEY_statusfile_prefix} : ${prefix:=$DFLT_STATUSDIR} suffix="${OCF_RESOURCE_INSTANCE}" [ "$OCF_RESKEY_CRM_meta_clone" ] && suffix="${suffix}_$OCF_RESKEY_CRM_meta_clone" suffix="${suffix}_`uname -n`" STATUSFILE=${OCF_RESKEY_directory}/$prefix$suffix ####################################################################### usage() { cat <<-EOT usage: $0 {start|stop|status|monitor|validate-all|meta-data} EOT } meta_data() { cat < 1.1 Resource script for Filesystem. It manages a Filesystem on a shared storage medium. The standard monitor operation of depth 0 (also known as probe) checks if the filesystem is mounted. If you want deeper tests, set OCF_CHECK_LEVEL to one of the following values: 10: read first 16 blocks of the device (raw read) This doesn't exercise the filesystem at all, but the device on which the filesystem lives. This is noop for non-block devices such as NFS, SMBFS, or bind mounts. 20: test if a status file can be written and read The status file must be writable by root. This is not always the case with an NFS mount, as NFS exports usually have the "root_squash" option set. In such a setup, you must either use read-only monitoring (depth=10), export with "no_root_squash" on your NFS server, or grant world write permissions on the directory where the status file is to be placed. Manages filesystem mounts The name of block device for the filesystem, or -U, -L options for mount, or NFS mount specification. block device The mount point for the filesystem. mount point The type of filesystem to be mounted. filesystem type Any extra options to be given as -o options to mount. For bind mounts, add "bind" here and set fstype to "none". We will do the right thing for options such as "bind,ro". 
options The prefix to be used for a status file for resource monitoring with depth 20. If you don't specify this parameter, all status files will be created in a separate directory. status file prefix Specify how to decide whether to run fsck or not. "auto" : decide to run fsck depending on the fstype(default) "force" : always run fsck regardless of the fstype "no" : do not run fsck ever. run_fsck Normally, we expect no users of the filesystem and the stop operation to finish quickly. If you cannot control the filesystem users easily and want to prevent the stop action from failing, then set this parameter to "no" and add an appropriate timeout for the stop operation. fast stop The use of a clone setup for local filesystems is forbidden by default. For special setups like glusterfs, cloning a mount of a local device with a filesystem like ext4 or xfs independently on several nodes is a valid use case. Only set this to "true" if you know what you are doing! allow running as a clone, regardless of filesystem type This option allows specifying how to handle processes that are currently accessing the mount directory. "true" : Default value, kill processes accessing mount point "safe" : Kill processes accessing mount point using methods that avoid functions that could potentially block during process detection "false" : Do not kill any processes. The 'safe' option uses shell logic to walk the /procs/ directory for pids using the mount point while the default option uses the fuser cli tool. fuser is known to perform operations that can potentially block if unresponsive nfs mounts are in use on the system. Kill processes before unmount END } # # Make sure the kernel does the right thing with the FS buffers # This function should be called after unmounting and before mounting # It may not be necessary in 2.4 and later kernels, but it shouldn't hurt # anything either... # # It's really a bug that you have to do this at all... 
# flushbufs() { if have_binary $BLOCKDEV ; then if [ "$blockdevice" = "yes" ] ; then $BLOCKDEV --flushbufs $1 return $? fi fi return 0 } # Take advantage of /etc/mtab if present, use portable mount command # otherwise. Normalize format to "dev mountpoint fstype". is_bind_mount() { echo "$options" | grep -w bind >/dev/null 2>&1 } list_mounts() { local inpf="" if [ -e "/proc/mounts" ] && ! is_bind_mount; then inpf=/proc/mounts elif [ -f "/etc/mtab" -a -r "/etc/mtab" ]; then inpf=/etc/mtab fi if [ "$inpf" ]; then cut -d' ' -f1,2,3 < $inpf else $MOUNT | cut -d' ' -f1,3,5 fi } determine_blockdevice() { if [ $blockdevice = "yes" ]; then return fi # Get the current real device name, if possible. # (specified devname could be -L or -U...) case "$FSTYPE" in nfs4|nfs|smbfs|cifs|glusterfs|ceph|tmpfs|overlay|overlayfs|rozofs|none) : ;; *) DEVICE=`list_mounts | grep " $MOUNTPOINT " | cut -d' ' -f1` if [ -b "$DEVICE" ]; then blockdevice=yes fi ;; esac } # Lists all filesystems potentially mounted under a given path, # excluding the path itself. list_submounts() { list_mounts | grep " $1/" | cut -d' ' -f2 | sort -r } # kernels < 2.6.26 can't handle bind remounts bind_kernel_check() { echo "$options" | grep -w ro >/dev/null 2>&1 || return uname -r | awk -F. ' $1==2 && $2==6 { sub("[^0-9].*","",$3); if ($3<26) exit(1); }' [ $? 
-ne 0 ] && ocf_log warn "kernel `uname -r` cannot handle read only bind mounts" } bind_mount() { if is_bind_mount && [ "$options" != "-o bind" ] then bind_kernel_check bind_opts=`echo $options | sed 's/bind/remount/'` $MOUNT $bind_opts $MOUNTPOINT else true # make sure to return OK fi } is_option() { echo $OCF_RESKEY_options | grep -w "$1" >/dev/null 2>&1 } is_fsck_needed() { case $OCF_RESKEY_run_fsck in force) true;; no) false;; ""|auto) case $FSTYPE in ext4|ext4dev|ext3|reiserfs|reiser4|nss|xfs|jfs|vfat|fat|nfs4|nfs|cifs|smbfs|ocfs2|gfs2|none|lustre|glusterfs|ceph|tmpfs|overlay|overlayfs|rozofs) false;; *) true;; esac;; *) ocf_log warn "Invalid parameter value for fsck: '$OCF_RESKEY_run_fsck'; setting to 'auto'" OCF_RESKEY_run_fsck="auto" is_fsck_needed;; esac } fstype_supported() { local support="$FSTYPE" local rc if [ "X${HOSTOS}" != "XOpenBSD" ];then # skip checking /proc/filesystems for obsd return $OCF_SUCCESS fi if [ -z "$FSTYPE" -o "$FSTYPE" = none ]; then : No FSTYPE specified, rely on the system has the right file-system support already return $OCF_SUCCESS fi # support fuse-filesystems (e.g. GlusterFS) case $FSTYPE in fuse.*|glusterfs|rozofs) support="fuse";; esac grep -w "$support"'$' /proc/filesystems >/dev/null if [ $? -eq 0 ]; then # found the fs type return $OCF_SUCCESS fi # if here, we should attempt to load the module and then # check if the filesystem support exists again. $MODPROBE $support >/dev/null if [ $? -ne 0 ]; then ocf_exit_reason "Couldn't find filesystem $FSTYPE in /proc/filesystems and failed to load kernel module" return $OCF_ERR_INSTALLED fi # It is possible for the module to load and not be completely initialized # before we check /proc/filesystems again. Give this a few tries before # giving up entirely. for try in $(seq 5); do grep -w "$support"'$' /proc/filesystems >/dev/null if [ $? -eq 0 ] ; then # yes.
found the filesystem after doing the modprobe return $OCF_SUCCESS fi ocf_log debug "Unable to find support for $FSTYPE in /proc/filesystems after modprobe, trying again" sleep 1 done ocf_exit_reason "Couldn't find filesystem $FSTYPE in /proc/filesystems" return $OCF_ERR_INSTALLED } # # START: Start up the filesystem # Filesystem_start() { # See if the device is already mounted. if Filesystem_status >/dev/null 2>&1 ; then ocf_log info "Filesystem $MOUNTPOINT is already mounted." return $OCF_SUCCESS fi fstype_supported || exit $OCF_ERR_INSTALLED # Check the filesystem & auto repair. # NOTE: Some filesystem types don't need this step... Please modify # accordingly if [ $blockdevice = "yes" ]; then if [ "$DEVICE" != "/dev/null" -a ! -b "$DEVICE" ] ; then ocf_exit_reason "Couldn't find device [$DEVICE]. Expected /dev/??? to exist" exit $OCF_ERR_INSTALLED fi if is_fsck_needed; then ocf_log info "Starting filesystem check on $DEVICE" if [ -z "$FSTYPE" ]; then $FSCK -p $DEVICE else $FSCK -t $FSTYPE -p $DEVICE fi # NOTE: if any errors at all are detected, it returns non-zero # if the error is >= 4 then there is a big problem if [ $? -ge 4 ]; then ocf_exit_reason "Couldn't successfully fsck filesystem for $DEVICE" return $OCF_ERR_GENERIC fi fi fi [ -d "$MOUNTPOINT" ] || ocf_run mkdir -p $MOUNTPOINT if [ ! -d "$MOUNTPOINT" ] ; then ocf_exit_reason "Couldn't find directory [$MOUNTPOINT] to use as a mount point" exit $OCF_ERR_INSTALLED fi flushbufs $DEVICE # Mount the filesystem. case "$FSTYPE" in none) $MOUNT $options $DEVICE $MOUNTPOINT && bind_mount ;; "") $MOUNT $options $DEVICE $MOUNTPOINT ;; *) $MOUNT -t $FSTYPE $options $DEVICE $MOUNTPOINT ;; esac if [ $? 
-ne 0 ]; then - ocf_exit_reason "Couldn't mount filesystem $DEVICE on $MOUNTPOINT" + ocf_exit_reason "Couldn't mount device [$DEVICE] as $MOUNTPOINT" return $OCF_ERR_GENERIC fi return $OCF_SUCCESS } # end of Filesystem_start get_pids() { local dir=$1 local procs local mmap_procs if ocf_is_true "$FORCE_UNMOUNT"; then if [ "X${HOSTOS}" = "XOpenBSD" ];then fstat | grep $dir | awk '{print $3}' else $FUSER -m $dir 2>/dev/null fi elif [ "$FORCE_UNMOUNT" = "safe" ]; then procs=$(find /proc/[0-9]*/ -type l -lname "${dir}/*" -or -lname "${dir}" 2>/dev/null | awk -F/ '{print $3}') mmap_procs=$(grep " ${dir}" /proc/[0-9]*/maps | awk -F/ '{print $3}') printf "${procs}\n${mmap_procs}" | sort | uniq fi } signal_processes() { local dir=$1 local sig=$2 local pids pid # fuser returns a non-zero return code if none of the # specified files is accessed or in case of a fatal # error. pids=$(get_pids "$dir") if [ -z "$pids" ]; then ocf_log info "No processes on $dir were signalled. force_unmount is set to '$FORCE_UNMOUNT'" return fi for pid in $pids; do ocf_log info "sending signal $sig to: `ps -f $pid | tail -1`" kill -s $sig $pid done } try_umount() { local SUB=$1 $UMOUNT $umount_force $SUB list_mounts | grep -q " $SUB " >/dev/null 2>&1 || { ocf_log info "unmounted $SUB successfully" return $OCF_SUCCESS } return $OCF_ERR_GENERIC } fs_stop() { local SUB=$1 timeout=$2 sig cnt for sig in TERM KILL; do cnt=$((timeout/2)) # try half time with TERM while [ $cnt -gt 0 ]; do try_umount $SUB && return $OCF_SUCCESS ocf_exit_reason "Couldn't unmount $SUB; trying cleanup with $sig" signal_processes $SUB $sig cnt=$((cnt-1)) sleep 1 done done return $OCF_ERR_GENERIC } # # STOP: Unmount the filesystem # Filesystem_stop() { # See if the device is currently mounted Filesystem_status >/dev/null 2>&1 if [ $? -eq $OCF_NOT_RUNNING ]; then # Already unmounted, wonderful. 
rc=$OCF_SUCCESS else # Wipe the status file, but continue with a warning if # removal fails -- the file system might be read only if [ $OCF_CHECK_LEVEL -eq 20 ]; then rm -f ${STATUSFILE} if [ $? -ne 0 ]; then ocf_log warn "Failed to remove status file ${STATUSFILE}." fi fi # Determine the real blockdevice this is mounted on (if # possible) prior to unmounting. determine_blockdevice # For networked filesystems, there's merit in trying -f: case "$FSTYPE" in nfs4|nfs|cifs|smbfs) umount_force="-f" ;; esac # Umount all sub-filesystems mounted under $MOUNTPOINT/ too. local timeout for SUB in `list_submounts $MOUNTPOINT` $MOUNTPOINT; do ocf_log info "Trying to unmount $SUB" if ocf_is_true "$FAST_STOP"; then timeout=6 else timeout=${OCF_RESKEY_CRM_meta_timeout:="20000"} timeout=$((timeout/1000)) fi fs_stop $SUB $timeout rc=$? if [ $rc -ne $OCF_SUCCESS ]; then ocf_exit_reason "Couldn't unmount $SUB, giving up!" fi done fi flushbufs $DEVICE return $rc } # end of Filesystem_stop # # STATUS: is the filesystem mounted or not? # Filesystem_status() { if list_mounts | grep -q " $MOUNTPOINT " >/dev/null 2>&1; then rc=$OCF_SUCCESS msg="$MOUNTPOINT is mounted (running)" else rc=$OCF_NOT_RUNNING msg="$MOUNTPOINT is unmounted (stopped)" fi # Special case "monitor" to check whether the UUID cached and # on-disk still match? case "$OP" in status) ocf_log info "$msg";; esac return $rc } # end of Filesystem_status # Note: the read/write tests below will stall in case the # underlying block device (or in the case of a NAS mount, the # NAS server) has gone away. In that case, if I/O does not # return to normal in time, the operation hits its timeout # and it is up to the CRM to initiate appropriate recovery # actions (such as fencing the node). 
# # MONITOR 10: read the device # Filesystem_monitor_10() { if [ "$blockdevice" = "no" ] ; then ocf_log warn "$DEVICE is not a block device, monitor 10 is noop" return $OCF_SUCCESS fi dd_opts="iflag=direct bs=4k count=1" err_output=`dd if=$DEVICE $dd_opts 2>&1 >/dev/null` if [ $? -ne 0 ]; then ocf_exit_reason "Failed to read device $DEVICE" ocf_log err "dd said: $err_output" return $OCF_ERR_GENERIC fi return $OCF_SUCCESS } # # MONITOR 20: write and read a status file # Filesystem_monitor_20() { if [ "$blockdevice" = "no" ] ; then # O_DIRECT not supported on cifs/smbfs dd_opts="oflag=sync bs=4k conv=fsync,sync" else # Writing to the device in O_DIRECT mode is imperative # to bypass caches. dd_opts="oflag=direct,sync bs=4k conv=fsync,sync" fi status_dir=`dirname $STATUSFILE` [ -d "$status_dir" ] || mkdir -p "$status_dir" err_output=`echo "${OCF_RESOURCE_INSTANCE}" | dd of=${STATUSFILE} $dd_opts 2>&1` if [ $? -ne 0 ]; then ocf_exit_reason "Failed to write status file ${STATUSFILE}" ocf_log err "dd said: $err_output" return $OCF_ERR_GENERIC fi test -f ${STATUSFILE} if [ $? -ne 0 ]; then ocf_exit_reason "Cannot stat the status file ${STATUSFILE}" return $OCF_ERR_GENERIC fi cat ${STATUSFILE} > /dev/null if [ $? -ne 0 ]; then ocf_exit_reason "Cannot read the status file ${STATUSFILE}" return $OCF_ERR_GENERIC fi return $OCF_SUCCESS } Filesystem_monitor() { Filesystem_status rc=$? if [ $rc -ne $OCF_SUCCESS ]; then return $rc fi if [ $rc -eq $OCF_SUCCESS -a $OCF_CHECK_LEVEL -gt 0 ]; then case "$OCF_CHECK_LEVEL" in 10) Filesystem_monitor_10; rc=$?;; 20) Filesystem_monitor_20; rc=$?;; *) ocf_exit_reason "unsupported monitor level $OCF_CHECK_LEVEL" rc=$OCF_ERR_CONFIGURED ;; esac fi return $rc } # end of Filesystem_monitor # # VALIDATE_ALL: Are the instance parameters valid? # FIXME!! The only part that's useful is the return code. # This code always returns $OCF_SUCCESS (!) # Filesystem_validate_all() { if [ -n $MOUNTPOINT -a ! 
-d $MOUNTPOINT ]; then ocf_log warn "Mountpoint $MOUNTPOINT does not exist" fi # Check if the $FSTYPE is workable # NOTE: Without inserting the $FSTYPE module, this step may be imprecise # TODO: This is Linux specific crap. if [ ! -z "$FSTYPE" -a "$FSTYPE" != none ]; then cut -f2 /proc/filesystems |grep -q ^$FSTYPE$ if [ $? -ne 0 ]; then modpath=/lib/modules/`uname -r` moddep=$modpath/modules.dep # Do we have $FSTYPE in modules.dep? cut -d' ' -f1 $moddep |grep -q "^$modpath.*$FSTYPE\.k\?o:$" if [ $? -ne 0 ]; then ocf_log info "It seems we do not have $FSTYPE support" fi fi fi # If we are supposed to do monitoring with status files, then # we need a utility to write in O_DIRECT mode. if [ $OCF_CHECK_LEVEL -gt 0 ]; then check_binary dd # Note: really old coreutils version do not support # the "oflag" option for dd. We don't check for that # here. In case dd does not support oflag, monitor is # bound to fail, with dd spewing an error message to # the logs. On such systems, we must do without status # file monitoring. fi #TODO: How to check the $options ? return $OCF_SUCCESS } # # set the blockdevice variable to "no" or "yes" # set_blockdevice_var() { blockdevice=no # these are definitely not block devices case $FSTYPE in nfs4|nfs|smbfs|cifs|none|glusterfs|ceph|tmpfs|overlay|overlayfs|rozofs) return;; esac if `is_option "loop"`; then return fi case $DEVICE in -*) # Oh... An option to mount instead... Typically -U or -L ;; /dev/null) # Special case for BSC blockdevice=yes ;; *) if [ ! -b "$DEVICE" -a ! -d "$DEVICE" -a "X$OP" != Xstart ] ; then ocf_log warn "Couldn't find device [$DEVICE]. Expected /dev/??? to exist" fi if [ ! -d "$DEVICE" ]; then blockdevice=yes fi ;; esac } # Check the arguments passed to this script if [ $# -ne 1 ]; then usage exit $OCF_ERR_ARGS fi # Check the OCF_RESKEY_ environment variables... 
FORCE_UNMOUNT="yes" if [ -n "${OCF_RESKEY_force_unmount}" ]; then FORCE_UNMOUNT=$OCF_RESKEY_force_unmount fi DEVICE=$OCF_RESKEY_device FSTYPE=$OCF_RESKEY_fstype if [ ! -z "$OCF_RESKEY_options" ]; then options="-o $OCF_RESKEY_options" fi FAST_STOP=${OCF_RESKEY_fast_stop:="yes"} OP=$1 # These operations do not require instance parameters case $OP in meta-data) meta_data exit $OCF_SUCCESS ;; usage) usage exit $OCF_SUCCESS ;; esac if [ x = x"$DEVICE" ]; then ocf_exit_reason "Please set OCF_RESKEY_device to the device to be managed" exit $OCF_ERR_CONFIGURED fi set_blockdevice_var # Normalize instance parameters: # It is possible that OCF_RESKEY_directory has one or even multiple trailing "/". # But the output of `mount` and /proc/mounts do not. if [ -z "$OCF_RESKEY_directory" ]; then if [ X$OP = "Xstart" -o $blockdevice = "no" ]; then ocf_exit_reason "Please specify the directory" exit $OCF_ERR_CONFIGURED fi else MOUNTPOINT=$(echo $OCF_RESKEY_directory | sed 's/\/*$//') : ${MOUNTPOINT:=/} # At this stage, $MOUNTPOINT does not contain trailing "/" unless it is "/" # TODO: / mounted via Filesystem sounds dangerous. On stop, we'll # kill the whole system. Is that a good idea? fi # Check to make sure the utilites are found if [ "X${HOSTOS}" != "XOpenBSD" ];then check_binary $MODPROBE check_binary $FUSER fi check_binary $FSCK check_binary $MOUNT check_binary $UMOUNT if [ "$OP" != "monitor" ]; then ocf_log info "Running $OP for $DEVICE on $MOUNTPOINT" fi case $OP in status) Filesystem_status exit $? ;; monitor) Filesystem_monitor exit $? ;; validate-all) Filesystem_validate_all exit $? ;; stop) Filesystem_stop exit $? 
;; esac CLUSTERSAFE=0 is_option "ro" && CLUSTERSAFE=2 case $FSTYPE in nfs4|nfs|smbfs|cifs|none|gfs2|glusterfs|ceph|ocfs2|overlay|overlayfs|tmpfs) CLUSTERSAFE=1 # this is kind of safe too ;; # add here CLUSTERSAFE=0 for all filesystems which are not # cluster aware and which, even if when mounted read-only, # could still modify parts of it such as journal/metadata ext4|ext4dev|ext3|reiserfs|reiser4|xfs|jfs) if ocf_is_true "$OCF_RESKEY_force_clones"; then CLUSTERSAFE=2 else CLUSTERSAFE=0 # these are not allowed fi ;; esac if ocf_is_clone; then case $CLUSTERSAFE in 0) ocf_exit_reason "DANGER! $FSTYPE on $DEVICE is NOT cluster-aware!" ocf_log err "DO NOT RUN IT AS A CLONE!" ocf_log err "Politely refusing to proceed to avoid data corruption." exit $OCF_ERR_CONFIGURED ;; 2) ocf_log warn "$FSTYPE on $DEVICE is NOT cluster-aware!" if ocf_is_true "$OCF_RESKEY_force_clones"; then ocf_log warn "But we'll let it run because we trust _YOU_ verified it's safe to do so." else ocf_log warn "But we'll let it run because it is mounted read-only." ocf_log warn "Please make sure that it's meta data is read-only too!" fi ;; esac fi case $OP in start) Filesystem_start ;; *) usage exit $OCF_ERR_UNIMPLEMENTED ;; esac exit $? diff --git a/heartbeat/LVM b/heartbeat/LVM index 5d202c6f2..49ebce85c 100755 --- a/heartbeat/LVM +++ b/heartbeat/LVM @@ -1,713 +1,713 @@ #!/bin/sh # # # LVM # # Description: Manages an LVM volume as an HA resource # # # Author: Alan Robertson # Support: linux-ha@lists.linux-ha.org # License: GNU General Public License (GPL) # Copyright: (C) 2002 - 2005 International Business Machines, Inc. # # This code significantly inspired by the LVM resource # in FailSafe by Lars Marowsky-Bree # # # An example usage in /etc/ha.d/haresources: # node1 10.0.0.170 ServeRAID::1::1 LVM::myvolname # # See usage() function below for more details... 
# # OCF parameters are as below: # OCF_RESKEY_volgrpname # ####################################################################### # Initialization: : ${OCF_FUNCTIONS_DIR=${OCF_ROOT}/lib/heartbeat} . ${OCF_FUNCTIONS_DIR}/ocf-shellfuncs ####################################################################### usage() { methods=`LVM_methods` methods=`echo $methods | tr ' ' '|'` cat < 1.0 Resource script for LVM. It manages an Linux Volume Manager volume (LVM) as an HA resource. Controls the availability of an LVM Volume Group The name of volume group. Volume group name If set, the volume group will be activated exclusively. This option works one of two ways. If the volume group has the cluster attribute set, then the volume group will be activated exclusively using clvmd across the cluster. If the cluster attribute is not set, the volume group will be activated exclusively using a tag and the volume_list filter. When the tag option is in use, the volume_list in lvm.con must be initialized. This can be as simple as setting 'volume_list = []' depending on your setup. Exclusive activation If "exclusive" is set on a non clustered volume group, this overrides the tag to be used. Exclusive activation tag If set, the volume group will be activated partially even with some physical volumes missing. It helps to set to true when using mirrored logical volumes. Activate VG partially when missing PVs EOF } # # methods: What methods/operations do we support? # LVM_methods() { cat < /dev/null 2>&1 if [ $? -ne 0 ]; then return fi ## # Now check to see if the initrd has been updated. # If not, the machine could boot and activate the VG outside # the control of pacemaker ## if [ "$(find /boot -name *.img -newer /etc/lvm/lvm.conf)" = "" ]; then ocf_log warn "LVM: Improper setup detected" ocf_log warn "* initrd image needs to be newer than lvm.conf" # While dangerous if not done the first time, there are many # cases where we don't simply want to fail here. 
Instead, # keep warning until the user remakes the initrd - or has # it done for them by upgrading the kernel. # # initrd can be updated using this command. # dracut -H -f /boot/initramfs-$(uname -r).img $(uname -r) # fi } ## # does this vg have our tag ## check_tags() { local owner=`vgs -o tags --noheadings $OCF_RESKEY_volgrpname | tr -d ' '` if [ -z "$owner" ]; then # No-one owns this VG yet return 1 fi if [ "$OUR_TAG" = "$owner" ]; then # yep, this is ours return 0 fi # some other tag is set on this vg return 2 } strip_tags() { local i for i in `vgs --noheadings -o tags $OCF_RESKEY_volgrpname | sed s/","/" "/g`; do ocf_log info "Stripping tag, $i" # LVM version 2.02.98 allows changing tags if PARTIAL vgchange --deltag $i $OCF_RESKEY_volgrpname done if [ ! -z `vgs -o tags --noheadings $OCF_RESKEY_volgrpname | tr -d ' '` ]; then ocf_exit_reason "Failed to remove ownership tags from $OCF_RESKEY_volgrpname" return $OCF_ERR_GENERIC fi return $OCF_SUCCESS } set_tags() { check_tags case $? in 0) # we already own it. return $OCF_SUCCESS ;; 2) # other tags are set, strip them before setting if ! strip_tags; then return $OCF_ERR_GENERIC fi ;; *) : ;; esac vgchange --addtag $OUR_TAG $OCF_RESKEY_volgrpname if [ $? -ne 0 ]; then ocf_exit_reason "Failed to add ownership tag to $OCF_RESKEY_volgrpname" return $OCF_ERR_GENERIC fi ocf_log info "New tag \"$OUR_TAG\" added to $OCF_RESKEY_volgrpname" return $OCF_SUCCESS } # # Return LVM status (silently) # LVM_status() { local rc=1 loglevel="debug" # Set the log level of the error message if [ "X${2}" = "X" ]; then loglevel="err" if ocf_is_probe; then loglevel="warn" else if [ ${OP_METHOD} = "stop" ]; then loglevel="info" fi fi fi if [ -d /dev/$1 ]; then test "`cd /dev/$1 && ls`" != "" rc=$? if [ $rc -ne 0 ]; then ocf_exit_reason "VG $1 with no logical volumes is not supported by this RA!" 
fi fi if [ $rc -ne 0 ]; then ocf_log $loglevel "LVM Volume $1 is not available (stopped)" rc=$OCF_NOT_RUNNING else case $(get_vg_mode) in 1) # exclusive with tagging. # If vg is running, make sure the correct tag is present. Otherwise we # can not guarantee exclusive activation. if ! check_tags; then ocf_exit_reason "WARNING: $OCF_RESKEY_volgrpname is active without the cluster tag, \"$OUR_TAG\"" rc=$OCF_ERR_GENERIC fi # make sure the environment for tags activation is still valid if ! verify_tags_environment; then rc=$OCF_ERR_GENERIC fi # let the user know if their initrd is older than lvm.conf. check_initrd_warning ;; *) : ;; esac fi if [ "X${2}" = "X" ]; then # status call return return $rc fi # Report on LVM volume status to stdout... if [ $rc -eq 0 ]; then echo "Volume $1 is available (running)" else echo "Volume $1 is not available (stopped)" fi return $rc } get_activate_options() { local options="-a" case $(get_vg_mode) in 0) options="${options}ly";; 1) options="${options}y --config activation{volume_list=[\"@${OUR_TAG}\"]}";; 2) options="${options}ey";; esac if ocf_is_true "$OCF_RESKEY_partial_activation" ; then options="${options} --partial" fi # for clones (clustered volume groups), we'll also have to force # monitoring, even if disabled in lvm.conf. if ocf_is_clone; then options="$options --monitor y" fi echo $options } ## # Attempt to deactivate vg cluster wide and then start the vg exclusively ## retry_exclusive_start() { - local vgchange_options=$(get_activate_options) + local vgchange_options="$(get_activate_options)" # Deactivate each LV in the group one by one cluster wide set -- $(lvs -o name,attr --noheadings $OCF_RESKEY_volgrpname 2> /dev/null) while [ $# -ge 2 ]; do case $2 in ????ao*) # open LVs cannot be deactivated. return $OCF_ERR_GENERIC;; *) if ! 
lvchange -an $OCF_RESKEY_volgrpname/$1; then ocf_exit_reason "Unable to perform required deactivation of $OCF_RESKEY_volgrpname/$1 before starting" return $OCF_ERR_GENERIC fi ;; esac shift 2 done ocf_run vgchange $vgchange_options $OCF_RESKEY_volgrpname } # # Enable LVM volume # LVM_start() { - local vgchange_options=$(get_activate_options) + local vgchange_options="$(get_activate_options)" local vg=$1 local clvmd=0 # TODO: This MUST run vgimport as well ocf_log info "Activating volume group $vg" if [ "$LVM_MAJOR" -eq "1" ]; then ocf_run vgscan $vg else ocf_run vgscan fi case $(get_vg_mode) in 2) clvmd=1 ;; 1) if ! set_tags; then return $OCF_ERR_GENERIC fi ;; *) : ;; esac if ! ocf_run vgchange $vgchange_options $vg; then if [ $clvmd -eq 0 ]; then return $OCF_ERR_GENERIC fi # Failure to exclusively activate cluster vg.: # This could be caused by a remotely active LV, Attempt # to disable volume group cluster wide and try again. # Allow for some settling sleep 5 if ! retry_exclusive_start; then return $OCF_ERR_GENERIC fi fi if LVM_status $vg; then : OK Volume $vg activated just fine! return $OCF_SUCCESS else ocf_exit_reason "LVM: $vg did not activate correctly" return $OCF_NOT_RUNNING fi } # # Disable the LVM volume # LVM_stop() { local res=$OCF_ERR_GENERIC local vgchange_options="-aln" local vg=$1 if ! vgs $vg > /dev/null 2>&1; then ocf_log info "Volume group $vg not found" return $OCF_SUCCESS fi ocf_log info "Deactivating volume group $vg" case $(get_vg_mode) in 1) vgchange_options="-an" ;; esac for i in $(seq 10) do ocf_run vgchange $vgchange_options $vg res=$? if LVM_status $vg; then ocf_exit_reason "LVM: $vg did not stop correctly" res=1 fi if [ $res -eq 0 ]; then break fi res=$OCF_ERR_GENERIC ocf_log warn "$vg still Active" ocf_log info "Retry deactivating volume group $vg" sleep 1 which udevadm > /dev/null 2>&1 && udevadm settle --timeout=5 done case $(get_vg_mode) in 1) if [ $res -eq 0 ]; then strip_tags res=$? 
fi ;; esac return $res } # # Check whether the OCF instance parameters are valid # LVM_validate_all() { check_binary $AWK ## # lvmetad is a daemon that caches lvm metadata to improve the # performance of LVM commands. This daemon should never be used when # volume groups exist that are being managed by the cluster. The lvmetad # daemon introduces a response lag, where certain LVM commands look like # they have completed (like vg activation) when in fact the command # is still in progress by the lvmetad. This can cause reliability issues # when managing volume groups in the cluster. For Example, if you have a # volume group that is a dependency for another application, it is possible # the cluster will think the volume group is activated and attempt to start # the application before volume group is really accesible... lvmetad is bad. ## lvm dumpconfig global/use_lvmetad | grep 'use_lvmetad.*=.*1' > /dev/null 2>&1 if [ $? -eq 0 ]; then # for now warn users that lvmetad is enabled and that they should disable it. In the # future we may want to consider refusing to start, or killing the lvmetad daemon. ocf_log warn "Disable lvmetad in lvm.conf. lvmetad should never be enabled in a clustered environment. Set use_lvmetad=0 and kill the lvmetad process" fi ## # Off-the-shelf tests... ## VGOUT=`vgck ${VOLUME} 2>&1` if [ $? -ne 0 ]; then # Inconsistency might be due to missing physical volumes, which doesn't # automatically mean we should fail. If partial_activation=true then # we should let start try to handle it, or if no PVs are listed as # "unknown device" then another node may have marked a device missing # where we have access to all of them and can start without issue. if vgs -o pv_attr --noheadings $OCF_RESKEY_volgrpname 2>/dev/null | grep 'm' > /dev/null 2>&1; then if vgs -o pv_name --noheadings $OCF_RESKEY_volgrpname 2>/dev/null | grep 'unknown device' > /dev/null 2>&1; then if ! 
ocf_is_true "$OCF_RESKEY_partial_activation" ; then # We are missing devices and cannot activate partially ocf_exit_reason "Volume group [$VOLUME] has devices missing. Consider partial_activation=true to attempt to activate partially" exit $OCF_ERR_GENERIC else # We are missing devices but are allowed to activate partially. # Assume that caused the vgck failure and carry on ocf_log warn "Volume group inconsistency detected with missing device(s) and partial_activation enabled. Proceeding with requested action." fi fi # else the vg is partial but all devices are accounted for, so another # node must have marked the device missing. Proceed. else # vgck failure was for something other than missing devices ocf_exit_reason "Volume group [$VOLUME] does not exist or contains error! ${VGOUT}" exit $OCF_ERR_GENERIC fi fi ## # Does the Volume Group exist? ## if [ "$LVM_MAJOR" = "1" ]; then VGOUT=`vgdisplay ${VOLUME} 2>&1` else VGOUT=`vgdisplay -v ${VOLUME} 2>&1` fi if [ $? -ne 0 ]; then ocf_exit_reason "Volume group [$VOLUME] does not exist or contains error! ${VGOUT}" exit $OCF_ERR_GENERIC fi ## # If exclusive activation is not enabled, then # further checking of proper setup is not necessary ## if ! ocf_is_true "$OCF_RESKEY_exclusive"; then return $OCF_SUCCESS; fi ## # Having cloned lvm resources with exclusive vg activation makes no sense at all. ## if ocf_is_clone; then ocf_exit_reason "cloned lvm resources can not be activated exclusively" exit $OCF_ERR_CONFIGURED fi ## # Make sure the cluster attribute is set and clvmd is up when exclusive # activation is enabled. Otherwise we can't exclusively activate the volume group. ## case $(get_vg_mode) in 1) # exclusive activation using tags if ! verify_tags_environment; then exit $OCF_ERR_GENERIC fi ;; 2) # exclusive activation with clvmd ## # verify is clvmd running ## if ! 
ps -C clvmd > /dev/null 2>&1; then ocf_exit_reason "$OCF_RESKEY_volgrpname has the cluster attribute set, but 'clvmd' is not running" exit $OCF_ERR_GENERIC fi ;; *) : ;; esac return $OCF_SUCCESS } # # 'main' starts here... # if [ $# -ne 1 ] then usage exit $OCF_ERR_ARGS fi case $1 in meta-data) meta_data exit $OCF_SUCCESS;; methods) LVM_methods exit $?;; usage) usage exit $OCF_SUCCESS;; *) ;; esac if [ -z "$OCF_RESKEY_volgrpname" ] then ocf_exit_reason "You must identify the volume group name!" exit $OCF_ERR_CONFIGURED fi # Get the LVM version number, for this to work we assume(thanks to panjiam): # # LVM1 outputs like this # # # vgchange --version # vgchange: Logical Volume Manager 1.0.3 # Heinz Mauelshagen, Sistina Software 19/02/2002 (IOP 10) # # LVM2 and higher versions output in this format # # # vgchange --version # LVM version: 2.00.15 (2004-04-19) # Library version: 1.00.09-ioctl (2004-03-31) # Driver version: 4.1.0 LVM_VERSION=`vgchange --version 2>&1 | \ $AWK '/Logical Volume Manager/ {print $5"\n"; exit; } /LVM version:/ {printf $3"\n"; exit;}'` rc=$? if ( [ $rc -ne 0 ] || [ -z "$LVM_VERSION" ] ) then ocf_exit_reason "LVM: $1 could not determine LVM version. Try 'vgchange --version' manually and modify $0 ?" exit $OCF_ERR_INSTALLED fi LVM_MAJOR="${LVM_VERSION%%.*}" VOLUME=$OCF_RESKEY_volgrpname OP_METHOD=$1 if [ -n "$OCF_RESKEY_tag" ]; then OUR_TAG=$OCF_RESKEY_tag fi # What kind of method was invoked? 
case "$1" in start) LVM_validate_all LVM_start $VOLUME exit $?;; stop) LVM_stop $VOLUME exit $?;; status) LVM_status $VOLUME $1 exit $?;; monitor) LVM_status $VOLUME exit $?;; validate-all) LVM_validate_all ;; *) usage exit $OCF_ERR_UNIMPLEMENTED;; esac diff --git a/heartbeat/Makefile.am b/heartbeat/Makefile.am index b70c10404..df0e3b8b6 100644 --- a/heartbeat/Makefile.am +++ b/heartbeat/Makefile.am @@ -1,160 +1,161 @@ # Makefile.am for OCF RAs # # Author: Sun Jing Dong # Copyright (C) 2004 IBM # # This program is free software; you can redistribute it and/or # modify it under the terms of the GNU General Public License # as published by the Free Software Foundation; either version 2 # of the License, or (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program; if not, write to the Free Software # Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. 
# MAINTAINERCLEANFILES = Makefile.in EXTRA_DIST = $(ocf_SCRIPTS) $(ocfcommon_DATA) \ $(common_DATA) $(hb_DATA) $(dtd_DATA) \ README AM_CPPFLAGS = -I$(top_srcdir)/include -I$(top_srcdir)/linux-ha halibdir = $(libexecdir)/heartbeat ocfdir = $(OCF_RA_DIR_PREFIX)/heartbeat dtddir = $(datadir)/$(PACKAGE_NAME) dtd_DATA = ra-api-1.dtd if USE_IPV6ADDR_AGENT ocf_PROGRAMS = IPv6addr else ocf_PROGRAMS = endif if IPV6ADDR_COMPATIBLE halib_PROGRAMS = send_ua else halib_PROGRAMS = endif IPv6addr_SOURCES = IPv6addr.c IPv6addr_utils.c send_ua_SOURCES = send_ua.c IPv6addr_utils.c IPv6addr_LDADD = -lplumb $(LIBNETLIBS) send_ua_LDADD = $(LIBNETLIBS) ocf_SCRIPTS = ClusterMon \ CTDB \ Dummy \ IPaddr \ IPaddr2 \ anything \ AoEtarget \ apache \ asterisk \ nginx \ AudibleAlarm \ clvm \ conntrackd \ db2 \ dhcpd \ Delay \ dnsupdate \ docker \ eDir88 \ EvmsSCC \ Evmsd \ ethmonitor \ exportfs \ Filesystem \ fio \ galera \ + garbd \ ids \ iscsi \ ICP \ IPsrcaddr \ iSCSITarget \ iSCSILogicalUnit \ iface-bridge \ iface-vlan \ jboss \ kamailio \ LinuxSCSI \ LVM \ lxc \ MailTo \ ManageRAID \ ManageVE \ mysql \ mysql-proxy \ nagios \ named \ nfsnotify \ nfsserver \ oracle \ oralsnr \ pingd \ portblock \ postfix \ pound \ pgsql \ proftpd \ Pure-FTPd \ rabbitmq-cluster \ Raid1 \ redis \ Route \ rsyncd \ rsyslog \ SAPDatabase \ SAPInstance \ SendArp \ ServeRAID \ slapd \ SphinxSearchDaemon \ Squid \ Stateful \ SysInfo \ scsi2reservation \ sfex \ sg_persist \ symlink \ syslog-ng \ tomcat \ VIPArip \ VirtualDomain \ varnish \ vmware \ WAS \ WAS6 \ WinPopup \ Xen \ Xinetd \ zabbixserver ocfcommondir = $(OCF_LIB_DIR_PREFIX)/heartbeat ocfcommon_DATA = ocf-shellfuncs \ ocf-binaries \ ocf-directories \ ocf-returncodes \ ocf-rarun \ ocf-distro \ apache-conf.sh \ http-mon.sh \ sapdb-nosha.sh \ sapdb.sh \ ora-common.sh \ mysql-common.sh \ nfsserver-redhat.sh \ findif.sh # Legacy locations hbdir = $(sysconfdir)/ha.d hb_DATA = shellfuncs diff --git a/heartbeat/README.galera b/heartbeat/README.galera index 
56390e60b..dd45618e3 100644 --- a/heartbeat/README.galera +++ b/heartbeat/README.galera @@ -1,132 +1,148 @@ Notes regarding the Galera resource agent --- In the resource agent, the action of bootstrapping a Galera cluster is implemented into a series of small steps, by using: * Two CIB attributes `last-committed` and `bootstrap` to elect a bootstrap node that will restart the cluster. * One CIB attribute `sync-needed` that will identify that joining nodes are in the process of synchronizing their local database via SST. * A Master/Slave pacemaker resource which helps splitting the boot into steps, up to a point where a galera node is available. * the recurring monitor action to coordinate switch from one state to another. How boot works ==== There are two things to know to understand how the resource agent restart a Galera cluster. ### Bootstrap the cluster with the right node -When synced, the nodes of a galera clusters have in common a last seqno, +When synced, the nodes of a galera cluster have in common a last seqno, which identifies the last transaction considered successful by a majority of nodes in the cluster (think quorum). To restart a cluster, the resource agent must ensure that it will bootstrap the cluster from an node which is up-to-date, i.e which has the highest seqno of all nodes. As a result, if the resource agent cannot retrieve the seqno on all nodes, it won't be able to safely identify a bootstrap node, and will simply refuse to start the galera cluster. ### synchronizing nodes can be a long operation Starting a bootstrap node is relatively fast, so it's performed during the "promote" operation, which is a one-off, time-bounded operation. Subsequent nodes will need to synchronize via SST, which consists in "pushing" an entire Galera DB from one node to another. There is no perfect time-out, as time spent during synchronization depends on the size of the DB. 
Thus, joiner nodes are started during the "monitor" operation, which is a recurring operation that can better track the progress of the SST. State flow ==== General idea for starting Galera: * Before starting the Galera cluster each node needs to go in Slave state so that the agent records its last seqno into the CIB. __ This uses attribute last-committed __ * When all node went in Slave, the agent can safely determine the last seqno and elect a bootstrap node (`detect_first_master()`). __ This uses attribute bootstrap __ * The agent then sets the score of the elected bootstrap node to Master so that pacemaker promote it and start the first Galera server. * Once the first Master is running, the agent can start joiner nodes during the "monitor" operation, and starts monitoring their SST sync. __ This uses attribute sync-needed __ * Only when SST is over on joiner nodes, the agent promotes them to Master. At this point, the entire Galera cluster is up. Attribute usage and liveness ==== Here is how attributes are created on a per-node basis. If you modify the resource agent make sure those properties still hold. ### last-committed It is just a temporary hint for the resource agent to help elect a bootstrap node. Once the bootstrap attribute is set on one of the nodes, we can get rid of last-committed. - Used : during Slave state to compare seqno - Created: before entering Slave state: . at startup in `galera_start()` . or when a Galera node is stopped in `galera_demote()` - Deleted: just before node starts in `galera_start_local_node()`; cleaned-up during `galera_demote()` and `galera_stop()` We delete last-committed before starting Galera, to avoid race conditions that could arise due to discrepancies between the CIB and Galera. ### bootstrap Attribute set on the node that is elected to bootstrap Galera. 
- Used : during promotion in `galera_start_local_node()` - Created: at startup once all nodes have `last-committed`; or during monitor if all nodes have failed - Deleted: in `galera_start_local_node()`, just after the bootstrap node started and is ready; cleaned-up during `galera_demote()` and `galera_stop()` There cannot be more than one bootstrap node at any time, otherwise the Galera cluster would stop replicating properly. ### sync-needed While this attribute is set on a node, the Galera node is in JOIN state, i.e. SST is in progress and the node cannot serve queries. The resource agent relies on the underlying SST method to monitor the progress of the SST. For instance, with `wsrep_sst_rsync`, timeout would be reported by rsync, the Galera node would go in Non-primary state, which would make `galera_monitor()` fail. - Used : during recurring slave monitor in `check_sync_status()` - Created: in `galera_start_local_node()`, just after the joiner node started and entered the Galera cluster - Deleted: during recurring slave monitor in `check_sync_status()` as soon as the Galera code reports to be SYNC-ed. + +### no-grastate + +If a galera node was unexpectedly killed in the middle of a replication, +InnoDB can retain the equivalent of an XA transaction in prepared state +in its redo log. If so, mysqld cannot recover state (nor last seqno) +automatically, and a special recovery heuristic has to be used to +unblock the node. + +This transient attribute is used to keep track of forced recoveries to +prevent bootstrapping a cluster from a recovered node when possible. + +- Used : during `detect_first_master()` to elect the bootstrap node +- Created: in `detect_last_commit()` if the node has a pending XA + transaction to recover in the redo log +- Deleted: when a node is promoted to Master.
diff --git a/heartbeat/SAPDatabase b/heartbeat/SAPDatabase index de7959fee..641bd4086 100755 --- a/heartbeat/SAPDatabase +++ b/heartbeat/SAPDatabase @@ -1,331 +1,341 @@ #!/bin/sh # # SAPDatabase # # Description: Manages any type of SAP supported database instance # as a High-Availability OCF compliant resource. # # Author: Alexander Krauth, October 2006 # Support: linux@sap.com # License: GNU General Public License (GPL) # Copyright: (c) 2006, 2007, 2010, 2012 Alexander Krauth # # An example usage: # See usage() function below for more details... # # OCF instance parameters: # OCF_RESKEY_SID # OCF_RESKEY_DIR_EXECUTABLE (optional, well known directories will be searched by default) # OCF_RESKEY_DBTYPE (mandatory, one of the following values: ORA,ADA,DB6,SYB,HDB) # OCF_RESKEY_DBINSTANCE (optional, Database instance name, if not equal to SID) +# OCF_RESKEY_DBOSUSER (optional, the Linux user that owns the database processes on operating system level) # OCF_RESKEY_STRICT_MONITORING (optional, activate application level monitoring - with Oracle a failover will occur in case of an archiver stuck) # OCF_RESKEY_AUTOMATIC_RECOVER (optional, automatic startup recovery, default is false) # OCF_RESKEY_MONITOR_SERVICES (optional, default is to monitor all database services) # OCF_RESKEY_PRE_START_USEREXIT (optional, lists a script which can be executed before the resource is started) # OCF_RESKEY_POST_START_USEREXIT (optional, lists a script which can be executed after the resource is started) # OCF_RESKEY_PRE_STOP_USEREXIT (optional, lists a script which can be executed before the resource is stopped) # OCF_RESKEY_POST_STOP_USEREXIT (optional, lists a script which can be executed after the resource is stopped) # Deprecated parameters: # OCF_RESKEY_NETSERVICENAME # OCF_RESKEY_DBJ2EE_ONLY # OCF_RESKEY_JAVA_HOME # OCF_RESKEY_DIR_BOOTSTRAP # OCF_RESKEY_DIR_SECSTORE # OCF_RESKEY_DB_JARS # ####################################################################### # Initialization: : 
${OCF_FUNCTIONS_DIR=${OCF_ROOT}/lib/heartbeat} . ${OCF_FUNCTIONS_DIR}/ocf-shellfuncs ####################################################################### SH=/bin/sh usage() { methods=`sapdatabase_methods` methods=`echo $methods | tr ' ' '|'` cat <<-! usage: $0 ($methods) $0 manages a SAP database of any type as an HA resource. Currently Oracle, MaxDB, DB/2 UDB, Sybase ASE and SAP HANA Database are supported. ABAP databases as well as JAVA only databases are supported. The 'start' operation starts the instance. The 'stop' operation stops the instance. The 'status' operation reports whether the instance is running The 'monitor' operation reports whether the instance seems to be working The 'recover' operation tries to recover the instance after a crash (instance will be stopped first!) The 'validate-all' operation reports whether the parameters are valid The 'methods' operation reports on the methods $0 supports ! } meta_data() { cat < -2.06 +2.14 Manages a SAP database instance as an HA resource. Resource script for SAP databases. It manages a SAP database of any type as an HA resource. The purpose of the resource agent is to start, stop and monitor the database instance of a SAP system. Together with the RDBMS system it will also control the related network service for the database. Like the Oracle Listener and the xserver of MaxDB. The resource agent expects a standard SAP installation of the database and therefore needs less parameters to configure. The resource agent supports the following databases: - Oracle 10.2, 11.2 and 12 - DB/2 UDB for Windows and Unix 9.x - SAP-DB / MaxDB 7.x - Sybase ASE 15.7 - SAP HANA Database since 1.00 - with SAP node 1625203 (http://sdn.sap.com) In fact this resource agent does not run any database commands directly. It uses the SAP standard process SAPHostAgent to control the database. The SAPHostAgent must be installed on each cluster node locally. It will not work, if you try to run the SAPHostAgent also as a HA resource. 
Please follow SAP note 1031096 for the installation of SAPHostAgent. The required minimum version of SAPHostAgent is: Release: 7.20 Patch Number: 90 or compile time after: Dec 17 2011 The unique database system identifier. e.g. P01 Database system ID The full qualified path where to find saphostexec and saphostctrl. Usually you can leave this empty. Then the default: /usr/sap/hostctrl/exe is used. path of saphostexec and saphostctrl The name of the database vendor you use. Set either: ADA, DB6, ORA, SYB, HDB database vendor Must be used for special database implementations, when database instance name is not equal to the SID (e.g. Oracle DataGuard) Database instance name, if not equal to SID + + The parameter can be set, if the database processes on operating system level are not executed with the default user of the used database type. Defaults: ADA=taken from /etc/opt/sdb, DB6=db2SID, ORA=oraSID and oracle, SYB=sybSID, HDB=SIDadm + the Linux user that owns the database processes on operating system level + + Deprecated - do not use anymore. This parameter will be deleted in one of the next releases. deprecated - do not use anymore Deprecated - do not use anymore. This parameter will be deleted in one of the next releases. deprecated - do not use anymore Deprecated - do not use anymore. This parameter will be deleted in one of the next releases. deprecated - do not use anymore This controls how the resource agent monitors the database. If set to true, it will use 'saphostctrl -function GetDatabaseStatus' to test the database state. If set to false, only operating system processes are monitored. Activates application level monitoring If you set this to true, 'saphostctrl -function StartDatabase' will always be called with the '-force' option. Enable or disable automatic startup recovery Defines which services are monitored by the SAPDatabase resource agent, if STRICT_MONITORING is set to true. 
#
# sapuserexit : Many SAP customers need some additional processes/tools to run their SAP systems.
#               These specialties do not allow a totally generic SAP cluster resource agent.
#               Someone should write a resource agent for each additional process you need, if it
#               is required to monitor that process within the cluster manager. To enable
#               you to extend this resource agent without developing a new one, this user exit
#               was introduced.
#
# Arguments:
#   $1  name of the user-exit attribute (only used in log messages)
#   $2  path of the customer script to run; nothing happens when it is empty
#
# Returns: always $OCF_SUCCESS - the exit status of the customer script is
#          logged but deliberately does not influence the resource operation.
#
sapuserexit() {
  # Keep the arguments function-local so they do not leak into (or clobber)
  # variables of the same name in the rest of the agent.
  local NAME="$1"
  local VALUE="$2"

  if [ -n "$VALUE" ]
  then
    if have_binary "$VALUE"
    then
      ocf_log info "Calling userexit ${NAME} with customer script file ${VALUE}"
      "$VALUE" >/dev/null 2>&1
      # $? below still holds the exit status of the customer script
      ocf_log info "Exiting userexit ${NAME} with customer script file ${VALUE}, returncode: $?"
    else
      ocf_log warn "Attribute ${NAME} is set to ${VALUE}, but this file is not executable"
    fi
  fi
  return $OCF_SUCCESS
}
case "$ACTION" in start|stop|status|recover) sapdatabase_$ACTION exit $?;; monitor) sapdatabase_monitor $OCF_RESKEY_STRICT_MONITORING exit $?;; validate-all) sapdatabase_validate exit $?;; *) sapdatabase_methods exit $OCF_ERR_UNIMPLEMENTED;; esac diff --git a/heartbeat/SAPInstance b/heartbeat/SAPInstance index da394f5a1..be2ff3054 100755 --- a/heartbeat/SAPInstance +++ b/heartbeat/SAPInstance @@ -1,942 +1,943 @@ #!/bin/sh # # SAPInstance # # Description: Manages a single SAP Instance as a High-Availability # resource. One SAP Instance is defined by one # SAP Instance-Profile. start/stop handels all services # of the START-Profile, status and monitor care only # about essential services. # # Author: Alexander Krauth, June 2006 # Support: linux@sap.com # License: GNU General Public License (GPL) # Copyright: (c) 2006-2008 Alexander Krauth # # An example usage: # See usage() function below for more details... # # OCF instance parameters: # OCF_RESKEY_InstanceName # OCF_RESKEY_DIR_EXECUTABLE (optional, well known directories will be searched by default) # OCF_RESKEY_DIR_PROFILE (optional, well known directories will be searched by default) # OCF_RESKEY_START_PROFILE (optional, well known directories will be searched by default) # OCF_RESKEY_START_WAITTIME (optional, to solve timing problems during J2EE-Addin start) # OCF_RESKEY_AUTOMATIC_RECOVER (optional, automatic startup recovery using cleanipc, default is false) # OCF_RESKEY_MONITOR_SERVICES (optional, default is to monitor critical services only) # OCF_RESKEY_SHUTDOWN_METHOD (optional, defaults to NORMAL, KILL: terminate the SAP instance with OS commands - faster, at your own risk) # OCF_RESKEY_ERS_InstanceName (optional, InstanceName of the ERS instance in a Master/Slave configuration) # OCF_RESKEY_ERS_START_PROFILE (optional, START_PROFILE of the ERS instance in a Master/Slave configuration) # OCF_RESKEY_PRE_START_USEREXIT (optional, lists a script which can be executed before the resource is started) # 
OCF_RESKEY_POST_START_USEREXIT (optional, lists a script which can be executed after the resource is started) # OCF_RESKEY_PRE_STOP_USEREXIT (optional, lists a script which can be executed before the resource is stopped) # OCF_RESKEY_POST_STOP_USEREXIT (optional, lists a script which can be executed after the resource is stopped) # # TODO: - Option to shutdown sapstartsrv for non-active instances -> that means: do probes only with OS tools (sapinstance_status) # - Option for better standalone enqueue server monitoring, using ensmon (test enque-deque) # - Option for cleanup abandoned enqueue replication tables # ####################################################################### # Initialization: : ${OCF_FUNCTIONS_DIR=${OCF_ROOT}/lib/heartbeat} . ${OCF_FUNCTIONS_DIR}/ocf-shellfuncs ####################################################################### SH=/bin/sh sapinstance_usage() { methods=`sapinstance_methods` methods=`echo $methods | tr ' ' '|'` cat <<-! usage: $0 ($methods) $0 manages a SAP Instance as an HA resource. The 'start' operation starts the instance or the ERS instance in a Master/Slave configuration The 'stop' operation stops the instance The 'status' operation reports whether the instance is running The 'monitor' operation reports whether the instance seems to be working The 'promote' operation starts the primary instance in a Master/Slave configuration The 'demote' operation stops the primary instance and starts the ERS instance The 'notify' operation always returns SUCCESS The 'validate-all' operation reports whether the parameters are valid The 'methods' operation reports on the methods $0 supports ! } sapinstance_meta_data() { cat < 2.14 Manages a SAP instance as an HA resource. Usually a SAP system consists of one database and at least one or more SAP instances (sometimes called application servers). One SAP Instance is defined by having exactly one instance profile. 
The instance profiles can usually be found in the directory /sapmnt/SID/profile. Each instance must be configured as it's own resource in the cluster configuration. The resource agent supports the following SAP versions: -- SAP WebAS ABAP Release 6.20 - 7.30 -- SAP WebAS Java Release 6.40 - 7.30 -- SAP WebAS ABAP + Java Add-In Release 6.20 - 7.30 (Java is not monitored by the cluster in that case) +- SAP WebAS ABAP Release 6.20 - 7.40 +- SAP WebAS Java Release 6.40 - 7.40 +- SAP WebAS ABAP + Java Add-In Release 6.20 - 7.40 (Java is not monitored by the cluster in that case) When using a SAP Kernel 6.40 please check and implement the actions from the section "Manual postprocessing" from SAP note 995116 (http://sdn.sap.com). +Other versions may also work with this agent, but have not been verified. All operations of the SAPInstance resource agent are done by using the startup framework called SAP Management Console or sapstartsrv that was introduced with SAP kernel release 6.40. Find more information about the SAP Management Console in SAP note 1014480. Using this framework defines a clear interface for the Heartbeat cluster, how it sees the SAP system. The options for monitoring the SAP system are also much better than other methods like just watching the ps command for running processes or doing some pings to the application. sapstartsrv uses SOAP messages to request the status of running SAP processes. Therefore it can actually ask a process itself what it's status is, independent from other problems that might exist at the same time. sapstartsrv knows 4 status colours: - GREEN = everything is fine - YELLOW = something is wrong, but the service is still working - RED = the service does not work - GRAY = the service has not been started The SAPInstance resource agent will interpret GREEN and YELLOW as OK. That means that minor problems will not be reported to the Heartbeat cluster. This prevents the cluster from doing an unwanted failover. 
The statuses RED and GRAY are reported as NOT_RUNNING to the cluster. Depending on the status the cluster expects from the resource, it will do a restart, failover or just nothing. The full qualified SAP instance name. e.g. P01_DVEBMGS00_sapp01ci. Usually this is the name of the SAP instance profile. Instance name: SID_INSTANCE_VIR-HOSTNAME The full qualified path where to find sapstartsrv and sapcontrol. Specify this parameter, if you have changed the SAP kernel directory location after the default SAP installation. Path of sapstartsrv and sapcontrol The full qualified path where to find the SAP START profile. Specify this parameter, if you have changed the SAP profile directory location after the default SAP installation. Path of start profile The name of the SAP START profile. Specify this parameter, if you have changed the name of the SAP START profile after the default SAP installation. As SAP release 7.10 does not have a START profile anymore, you need to specify the Instance Profile than. Start profile name After that time in seconds a monitor operation is executed by the resource agent. Does the monitor return SUCCESS, the start ishandled as SUCCESS. This is useful to resolve timing problems with e.g. the J2EE-Addin instance.Usually the resource agent waits until all services are started and the SAP Management Console reports a GREEN status. A double stack installation (ABAP + Java AddIn) consists of an ABAP dispatcher and aJAVA instance. Normally the start of the JAVA instance takes much longer than the start of the ABAP instance. For a JAVA Instance you may need to configure a much higher timeout for the start operation of the resource in Heartbeat. The disadvantage here is, that the discovery of a failed start by the cluster takes longer. Somebody might say: For me it is important, that the ABAP instance is up and running. A failure of the JAVA instance shall not cause a failover of the SAP instance. 
Actually the SAP MC reports a YELLOW status, if the JAVA instance of a double stack system fails. From the resource agent point of view YELLOW means:everything is OK. Setting START_WAITTIME to a lower value determines the resource agent to check the status of the instance during a start operation after that time. As it would wait normally for a GREEN status, now it reports SUCCESS to the cluster in case of a YELLOW status already after the specified time. That is only useful for double stack systems. Check the successful start after that time (do not wait for J2EE-Addin) The SAPInstance resource agent tries to recover a failed start attempt automaticaly one time. This is done by killing runing instance processes, removing the kill.sap file and executing cleanipc. Sometimes a crashed SAP instance leaves some processes and/or shared memory segments behind. Setting this option to true will try to remove those leftovers during a start operation. That is to reduce manual work for the administrator. Enable or disable automatic startup recovery Within a SAP instance there can be several services. Usually you will find the defined services in the START profile of the related instance (Attention: with SAP Release 7.10 the START profile content was moved to the instance profile). Not all of those services are worth to monitor by the cluster. For example you properly do not like to failover your SAP instance, if the central syslog collector daemon fails. Those services are monitored within the SAPInstance resource agent: - disp+work - msg_server - enserver - enrepserver - jcontrol - jstart That names match the strings used in the output of the command 'sapcontrol -nr [Instance-Nr] -function GetProcessList'. The default should fit most cases where you want to manage a SAP Instance from the cluster. You may change this with this parameter, if you like to monitor more/less or other services that sapstartsrv supports. 
You may specify multiple services seperated by a | (pipe) sign in this parameter: disp+work|msg_server|enserver Services to monitor Usual a SAP Instance is stopped by the command 'sapcontrol -nr InstanceNr -function Stop'. SHUTDOWN_METHOD=KILL means to kill the SAP Instance using OS commands. SAP processes of the instance are terminated with 'kill -9', shared memory is deleted with 'cleanipc' and the 'kill.sap' file will be deleted. That method is much faster than the gracefull stop, but the instance does not have the chance to say goodbye to other SAPinstances in the same system. USE AT YOUR OWN RISK !! Shutdown graceful or kill a SAP instance by terminating the processes. (normal|KILL) Only used in a Master/Slave resource configuration: The full qualified SAP enqueue replication instance name. e.g. P01_ERS02_sapp01ers. Usually this is the name of the SAP instance profile. The enqueue replication instance must be installed, before you want to configure a master-slave cluster recource. The master-slave configuration in the cluster must use this properties: clone_max = 2 clone_node_max = 1 master_node_max = 1 master_max = 1 Enqueue replication instance name: SID_INSTANCE_VIR-HOSTNAME Only used in a Master/Slave resource configuration: The parameter ERS_InstanceName must also be set in this configuration. The name of the SAP START profile. Specify this parameter, if you have changed the name of the SAP START profile after the default SAP installation. As SAP release 7.10 does not have a START profile anymore, you need to specify the Instance Profile than. Enqueue replication start profile name The full qualified path where to find a script or program which should be executed before this resource gets started. Path to a pre-start script The full qualified path where to find a script or program which should be executed after this resource got started. 
#
# is_clone : find out if we are configured to run in a Master/Slave configuration
#
# NOTE: the return value is inverted with respect to the function name:
#   returns 0  when the resource is NOT cloned (clone_max unset or not > 0)
#   returns 1  when the resource runs as a correctly configured Master/Slave clone
# The function exits the agent with OCF_ERR_CONFIGURED when the clone meta
# attributes do not match the expected layout, and with OCF_ERR_ARGS when
# ERS_InstanceName is missing in a clone setup.
#
is_clone() {
  # Guard: plain primitive resource, nothing to validate.
  if [ -z "$OCF_RESKEY_CRM_meta_clone_max" ] \
     || ! [ "$OCF_RESKEY_CRM_meta_clone_max" -gt 0 ]
  then
    return 0
  fi

  # Only the 2-node master/slave layout is supported.
  if [ "$OCF_RESKEY_CRM_meta_clone_max" -ne 2 ] \
     || [ "$OCF_RESKEY_CRM_meta_clone_node_max" -ne 1 ] \
     || [ "$OCF_RESKEY_CRM_meta_master_node_max" -ne 1 ] \
     || [ "$OCF_RESKEY_CRM_meta_master_max" -ne 1 ]
  then
    ocf_log err "Clone options misconfigured. (expect: clone_max=2,clone_node_max=1,master_node_max=1,master_max=1)"
    exit $OCF_ERR_CONFIGURED
  fi

  # The slave side needs to know which ERS instance to run.
  [ -n "$OCF_RESKEY_ERS_InstanceName" ] || {
    ocf_log err "In a Master/Slave configuration the ERS_InstanceName parameter is mandatory."
    exit $OCF_ERR_ARGS
  }

  return 1
}
} if [ "$ACTION" = "stop" ] then cleanup_instance exit $OCF_SUCCESS fi ocf_log err $err_msg exit $OCF_ERR_CONFIGURED } # # sapinstance_init : Define global variables with default values, if optional parameters are not set # # sapinstance_init() { local myInstanceName="$1" SID=`echo "$myInstanceName" | cut -d_ -f1` InstanceName=`echo "$myInstanceName" | cut -d_ -f2` InstanceNr=`echo "$InstanceName" | sed 's/.*\([0-9][0-9]\)$/\1/'` SAPVIRHOST=`echo "$myInstanceName" | cut -d_ -f3` # optional OCF parameters, we try to guess which directories are correct if [ -z "$OCF_RESKEY_DIR_EXECUTABLE" ] then if have_binary /usr/sap/$SID/$InstanceName/exe/sapstartsrv && have_binary /usr/sap/$SID/$InstanceName/exe/sapcontrol then DIR_EXECUTABLE="/usr/sap/$SID/$InstanceName/exe" SAPSTARTSRV="/usr/sap/$SID/$InstanceName/exe/sapstartsrv" SAPCONTROL="/usr/sap/$SID/$InstanceName/exe/sapcontrol" elif have_binary /usr/sap/$SID/SYS/exe/run/sapstartsrv && have_binary /usr/sap/$SID/SYS/exe/run/sapcontrol then DIR_EXECUTABLE="/usr/sap/$SID/SYS/exe/run" SAPSTARTSRV="/usr/sap/$SID/SYS/exe/run/sapstartsrv" SAPCONTROL="/usr/sap/$SID/SYS/exe/run/sapcontrol" fi else if have_binary "$OCF_RESKEY_DIR_EXECUTABLE/sapstartsrv" && have_binary "$OCF_RESKEY_DIR_EXECUTABLE/sapcontrol" then DIR_EXECUTABLE="$OCF_RESKEY_DIR_EXECUTABLE" SAPSTARTSRV="$OCF_RESKEY_DIR_EXECUTABLE/sapstartsrv" SAPCONTROL="$OCF_RESKEY_DIR_EXECUTABLE/sapcontrol" fi fi sidadm="`echo $SID | tr '[:upper:]' '[:lower:]'`adm" [ -z "$DIR_EXECUTABLE" ] && abnormal_end "Cannot find sapstartsrv and sapcontrol executable, please set DIR_EXECUTABLE parameter!" 
#
# check_sapstartsrv : Before using sapcontrol we make sure that the sapstartsrv is running for the correct instance.
#                     We cannot use sapinit and the /usr/sap/sapservices file in case of an enquerep instance,
#                     because then we have two instances with the same instance number.
#
# Reads the globals SID, InstanceName, InstanceNr, SAPCONTROL, SAPSTARTSRV,
# SAPSTARTPROFILE and sidadm; overwrites DIR_PROFILE when a restart is needed.
#
# Returns: OCF_SUCCESS when sapstartsrv answers for this instance (possibly
#          after having been restarted), OCF_ERR_GENERIC when it could not be
#          started, or OCF_NOT_RUNNING in that case during a probe.
#          May not return at all: abnormal_end is called when the profile
#          directory or the START profile cannot be found.
#
check_sapstartsrv() {
  local restart=0
  local runninginst=""
  local chkrc=$OCF_SUCCESS
  local output=""

  # The unix domain socket is the cheapest indicator that the service is up.
  if [ ! -S /tmp/.sapstream5${InstanceNr}13 ]; then
    ocf_log warn "sapstartsrv is not running for instance $SID-$InstanceName (no UDS), it will be started now"
    restart=1
  else
    output=`$SAPCONTROL -nr $InstanceNr -function ParameterValue INSTANCE_NAME -format script`
    if [ $? -eq 0 ]
    then
      # A service answers on this instance number - make sure it is ours.
      runninginst=`echo "$output" | grep '^0 : ' | cut -d' ' -f3`
      if [ "$runninginst" != "$InstanceName" ]
      then
        ocf_log warn "sapstartsrv is running for instance $runninginst, that service will be killed"
        restart=1
      else
        output=`$SAPCONTROL -nr $InstanceNr -function AccessCheck Start`
        if [ $? -ne 0 ]; then
          ocf_log warn "FAILED : sapcontrol -nr $InstanceNr -function AccessCheck Start (`ls -ld1 /tmp/.sapstream5${InstanceNr}13`)"
          ocf_log warn "sapstartsrv will be restarted to try to solve this situation, otherwise please check sapstartsrv setup (SAP Note 927637)"
          restart=1
        fi
      fi
    else
      ocf_log warn "sapstartsrv is not running for instance $SID-$InstanceName, it will be started now"
      restart=1
    fi
  fi

  # Without a detected foreign instance, the process to kill is our own.
  if [ -z "$runninginst" ]; then runninginst=$InstanceName; fi

  if [ $restart -eq 1 ]
  then
    # NOTE(review): this insists on the default profile directory even when
    # OCF_RESKEY_DIR_PROFILE is set - kept as is, confirm whether intended.
    if [ -d /usr/sap/$SID/SYS/profile/ ]
    then
      DIR_PROFILE="/usr/sap/$SID/SYS/profile"
    else
      abnormal_end "Expected /usr/sap/$SID/SYS/profile/ to be a directory, please set DIR_PROFILE parameter!"
    fi

    [ ! -r $SAPSTARTPROFILE ] && abnormal_end "Expected $SAPSTARTPROFILE to be the instance START profile, please set START_PROFILE parameter!"

    pkill -9 -f "sapstartsrv.*$runninginst"

    # removing the unix domain socket files as they might have wrong permissions
    # or ownership - they will be recreated by sapstartsrv during next start
    rm -f /tmp/.sapstream5${InstanceNr}13
    rm -f /tmp/.sapstream5${InstanceNr}14

    $SAPSTARTSRV pf=$SAPSTARTPROFILE -D -u $sidadm

    # now make sure the daemon has been started and is able to respond
    local srvrc=1
    while [ $srvrc -eq 1 -a `pgrep -f "sapstartsrv.*$runninginst" | wc -l` -gt 0 ]
    do
      sleep 1
      $SAPCONTROL -nr $InstanceNr -function GetProcessList > /dev/null 2>&1
      srvrc=$?
    done

    if [ $srvrc -ne 1 ]
    then
      ocf_log info "sapstartsrv for instance $SID-$InstanceName was restarted !"
      chkrc=$OCF_SUCCESS
    else
      # "err" is the severity keyword used by every other ocf_log call here
      ocf_log err "sapstartsrv for instance $SID-$InstanceName could not be started!"
      chkrc=$OCF_ERR_GENERIC
      ocf_is_probe && chkrc=$OCF_NOT_RUNNING
    fi
  fi

  return $chkrc
}
#
# cleanup_instance : remove resources (processes and shared memory) from a crashed instance
#
# Uses the globals sidadm, SID, InstanceName and InstanceNr.
# Always returns 0; every step is best effort.
#
cleanup_instance() {
  local leftover

  pkill -9 -f -U $sidadm $InstanceName
  ocf_log info "Terminated instance using 'pkill -9 -f -U $sidadm $InstanceName'"

  # it is necessary to call cleanipc as user sidadm if the system has 'vmcj/enable = ON' set -
  # otherwise SHM-segments in /dev/shm/SAP_ES2* cannot be removed
  su - $sidadm -c "cleanipc $InstanceNr remove"
  ocf_log info "Tried to remove shared memory resources using 'cleanipc $InstanceNr remove' as user $sidadm"

  # stale control/pid files left behind in the instance directory
  for leftover in work/kill.sap work/shutdown.sap data/rslgcpid data/rslgspid
  do
    ocf_run rm -fv /usr/sap/$SID/$InstanceName/$leftover
  done

  return 0
}
return $OCF_ERR_GENERIC fi local startrc=1 while [ $startrc -gt 0 ] do local waittime_start=`date +%s` output=`$SAPCONTROL -nr $InstanceNr -function WaitforStarted $OCF_RESKEY_START_WAITTIME 10` startrc=$? local waittime_stop=`date +%s` if [ $startrc -ne 0 ] then if [ $(($waittime_stop - $waittime_start)) -ge $OCF_RESKEY_START_WAITTIME ] then sapinstance_monitor NOLOG if [ $? -eq $OCF_SUCCESS ] then output="START_WAITTIME ($OCF_RESKEY_START_WAITTIME) has elapsed, but instance monitor returned SUCCESS. Instance considered running." startrc=0; loopcount=2 fi else if [ $loopcount -eq 1 ] && ocf_is_true $OCF_RESKEY_AUTOMATIC_RECOVER then ocf_log warn "SAP Instance $SID-$InstanceName start failed: $output" ocf_log warn "Try to recover $SID-$InstanceName" cleanup_instance else loopcount=2 fi startrc=-1 fi else loopcount=2 fi done done if [ $startrc -eq 0 ] then ocf_log info "SAP Instance $SID-$InstanceName started: $output" rc=$OCF_SUCCESS sapuserexit POST_START_USEREXIT "$OCF_RESKEY_POST_START_USEREXIT" else ocf_log err "SAP Instance $SID-$InstanceName start failed: $output" rc=$OCF_NOT_RUNNING fi return $rc } # # sapinstance_recover: Try startup of failed instance by cleaning up resources # sapinstance_recover() { cleanup_instance sapinstance_start return $? } # # sapinstance_stop: Stop the SAP instance # sapinstance_stop() { local output="" local rc sapuserexit PRE_STOP_USEREXIT "$OCF_RESKEY_PRE_STOP_USEREXIT" if [ "$OCF_RESKEY_SHUTDOWN_METHOD" = "KILL" ] then ocf_log info "Stopping SAP Instance $SID-$InstanceName with shutdown method KILL!" cleanup_instance return $OCF_SUCCESS fi check_sapstartsrv rc=$? if [ $rc -eq $OCF_SUCCESS ]; then output=`$SAPCONTROL -nr $InstanceNr -function Stop` rc=$? ocf_log info "Stopping SAP Instance $SID-$InstanceName: $output" fi if [ $rc -eq 0 ] then output=`$SAPCONTROL -nr $InstanceNr -function WaitforStopped 3600 1` if [ $? 
-eq 0 ] then ocf_log info "SAP Instance $SID-$InstanceName stopped: $output" rc=$OCF_SUCCESS else ocf_log err "SAP Instance $SID-$InstanceName stop failed: $output" rc=$OCF_ERR_GENERIC fi else ocf_log err "SAP Instance $SID-$InstanceName stop failed: $output" rc=$OCF_ERR_GENERIC fi sapuserexit POST_STOP_USEREXIT "$OCF_RESKEY_POST_STOP_USEREXIT" return $rc } # # sapinstance_monitor: Can the given SAP instance do anything useful? # sapinstance_monitor() { local MONLOG=$1 local rc check_sapstartsrv rc=$? if [ $rc -eq $OCF_SUCCESS ] then local count=0 local SERVNO local output output=`$SAPCONTROL -nr $InstanceNr -function GetProcessList -format script` # we have to parse the output, because the returncode doesn't tell anything about the instance status for SERVNO in `echo "$output" | grep '^[0-9] ' | cut -d' ' -f1 | sort -u` do local COLOR=`echo "$output" | grep "^$SERVNO dispstatus: " | cut -d' ' -f3` local SERVICE=`echo "$output" | grep "^$SERVNO name: " | cut -d' ' -f3` local STATE=0 local SEARCH case $COLOR in GREEN|YELLOW) STATE=$OCF_SUCCESS;; *) STATE=$OCF_NOT_RUNNING;; esac SEARCH=`echo "$OCF_RESKEY_MONITOR_SERVICES" | sed 's/\+/\\\+/g' | sed 's/\./\\\./g'` if [ `echo "$SERVICE" | egrep -c "$SEARCH"` -eq 1 ] then if [ $STATE -eq $OCF_NOT_RUNNING ] then [ "$MONLOG" != "NOLOG" ] && ocf_log err "SAP instance service $SERVICE is not running with status $COLOR !" rc=$STATE fi count=1 fi done if [ $count -eq 0 -a $rc -eq $OCF_SUCCESS ] then if ocf_is_probe then rc=$OCF_NOT_RUNNING else [ "$MONLOG" != "NOLOG" ] && ocf_log err "The SAP instance does not run any services which this RA could monitor!" rc=$OCF_ERR_GENERIC fi fi fi return $rc } # # sapinstance_status: Lightweight check of SAP instance only with OS tools # sapinstance_status() { local pid local pids [ ! 
-f "/usr/sap/$SID/$InstanceName/work/kill.sap" ] && return $OCF_NOT_RUNNING pids=`grep '^kill -[0-9]' /usr/sap/$SID/$InstanceName/work/kill.sap | awk '{print $3}'` for pid in $pids do [ `pgrep -f -U $sidadm $InstanceName | grep -c $pid` -gt 0 ] && return $OCF_SUCCESS done return $OCF_NOT_RUNNING } # # sapinstance_validate: Check the symantic of the input parameters # sapinstance_validate() { local rc=$OCF_SUCCESS if [ `echo "$SID" | grep -c '^[A-Z][A-Z0-9][A-Z0-9]$'` -ne 1 ] then ocf_log err "Parsing instance profile name: '$SID' is not a valid system ID!" rc=$OCF_ERR_ARGS fi if [ `echo "$InstanceName" | grep -c '^[A-Z].*[0-9][0-9]$'` -ne 1 ] then ocf_log err "Parsing instance profile name: '$InstanceName' is not a valid instance name!" rc=$OCF_ERR_ARGS fi if [ `echo "$InstanceNr" | grep -c '^[0-9][0-9]$'` -ne 1 ] then ocf_log err "Parsing instance profile name: '$InstanceNr' is not a valid instance number!" rc=$OCF_ERR_ARGS fi if [ `echo "$SAPVIRHOST" | grep -c '^[A-Za-z][A-Za-z0-9_-]*$'` -ne 1 ] then ocf_log err "Parsing instance profile name: '$SAPVIRHOST' is not a valid hostname!" rc=$OCF_ERR_ARGS fi return $rc } # # sapinstance_start_clone # sapinstance_start_clone() { sapinstance_init $OCF_RESKEY_ERS_InstanceName ${HA_SBIN_DIR}/crm_master -v 50 -l reboot sapinstance_start return $? } # # sapinstance_stop_clone # sapinstance_stop_clone() { sapinstance_init $OCF_RESKEY_ERS_InstanceName ${HA_SBIN_DIR}/crm_master -v 0 -l reboot sapinstance_stop return $? 
} # # sapinstance_monitor_clone # sapinstance_monitor_clone() { # first check with the status function (OS tools) if there could be something like a SAP instance running # as we do not know here, if we are in master or slave state we do not want to start our monitoring # agents (sapstartsrv) on the wrong host local rc sapinstance_init $OCF_RESKEY_InstanceName if sapinstance_status; then if sapinstance_monitor; then ${HA_SBIN_DIR}/crm_master -Q -v 100 -l reboot return $OCF_RUNNING_MASTER fi # by nature of the SAP enqueue server we have to make sure # that we do a failover to the slave (enqueue replication server) # in case the enqueue process has failed. We signal this to the # cluster by setting our master preference to a lower value than the slave. ${HA_SBIN_DIR}/crm_master -v 10 -l reboot return $OCF_FAILED_MASTER fi sapinstance_init $OCF_RESKEY_ERS_InstanceName sapinstance_status && sapinstance_monitor rc=$? if [ $rc -eq $OCF_SUCCESS ]; then ${HA_SBIN_DIR}/crm_master -Q -v 100 -l reboot fi return $rc } # # sapinstance_promote_clone: In a Master/Slave configuration get Master by starting the SCS instance and stopping the ERS instance # The order is important here to behave correct from the application levels view # sapinstance_promote_clone() { local rc sapinstance_init $OCF_RESKEY_InstanceName ocf_log info "Promoting $SID-$InstanceName to running Master." sapinstance_start rc=$? if [ $rc -eq $OCF_SUCCESS ]; then sapinstance_init $OCF_RESKEY_ERS_InstanceName sapinstance_stop rc=$? fi return $rc } # # sapinstance_demote_clone: In a Master/Slave configuration get Slave by stopping the SCS instance and starting the ERS instance # sapinstance_demote_clone() { local rc sapinstance_init $OCF_RESKEY_InstanceName ocf_log info "Demoting $SID-$InstanceName to a slave." sapinstance_stop rc=$? if [ $rc -eq $OCF_SUCCESS ]; then sapinstance_init $OCF_RESKEY_ERS_InstanceName sapinstance_start rc=$? 
fi return $rc } # # sapinstance_notify: Handle master scoring - to make sure a slave gets the next master # sapinstance_notify() { local n_type="$OCF_RESKEY_CRM_meta_notify_type" local n_op="$OCF_RESKEY_CRM_meta_notify_operation" if [ "${n_type}_${n_op}" = "post_promote" ]; then # After promotion of one master in the cluster, we make sure that all clones reset their master # value back to 100. This is because a failed monitor on a master might have degree one clone # instance to score 10. ${HA_SBIN_DIR}/crm_master -v 100 -l reboot elif [ "${n_type}_${n_op}" = "pre_demote" ]; then # if we are a slave and a demote event is anounced, make sure we have the highes wish to became master # that is, when a slave resource was startet after the promote event of a already running master (e.g. node of slave was down) # We also have to make sure to overrule the globaly set resource_stickiness or any fail-count factors => INFINITY local n_uname="$OCF_RESKEY_CRM_meta_notify_demote_uname" if [ ${n_uname} != ${NODENAME} ]; then ${HA_SBIN_DIR}/crm_master -v INFINITY -l reboot fi fi } # # 'main' starts here... # ## GLOBALS SID="" sidadm="" InstanceName="" InstanceNr="" SAPVIRHOST="" DIR_EXECUTABLE="" SAPSTARTSRV="" SAPCONTROL="" DIR_PROFILE="" SAPSTARTPROFILE="" CLONE=0 NODENAME=$(ocf_local_nodename) if ( [ $# -ne 1 ] ) then sapinstance_usage exit $OCF_ERR_ARGS fi ACTION=$1 if [ "$ACTION" = "status" ]; then ACTION=monitor fi # These operations don't require OCF instance parameters to be set case "$ACTION" in usage|methods) sapinstance_$ACTION exit $OCF_SUCCESS;; meta-data) sapinstance_meta_data exit $OCF_SUCCESS;; notify) sapinstance_notify exit $OCF_SUCCESS;; *);; esac if ! ocf_is_root then ocf_log err "$0 must be run as root" exit $OCF_ERR_PERM fi # parameter check if [ -z "$OCF_RESKEY_InstanceName" ] then ocf_log err "Please set OCF_RESKEY_InstanceName to the name to the SAP instance profile!" exit $OCF_ERR_ARGS fi is_clone; CLONE=$? 
if [ ${CLONE} -eq 1 ] then CLACT=_clone else if [ "$ACTION" = "promote" -o "$ACTION" = "demote" ] then ocf_log err "$ACTION called in a non master/slave environment" exit $OCF_ERR_ARGS fi sapinstance_init $OCF_RESKEY_InstanceName fi # What kind of method was invoked? case "$ACTION" in start|stop|monitor|promote|demote) sapinstance_$ACTION$CLACT exit $?;; validate-all) sapinstance_validate exit $?;; *) sapinstance_methods exit $OCF_ERR_UNIMPLEMENTED;; esac diff --git a/heartbeat/apache b/heartbeat/apache index 09d5ded49..040da6d1a 100755 --- a/heartbeat/apache +++ b/heartbeat/apache @@ -1,656 +1,698 @@ #!/bin/sh # # High-Availability Apache/IBMhttp control script # # apache (aka IBMhttpd) # # Description: starts/stops apache web servers. # # Author: Alan Robertson # Sun Jiang Dong # # Support: linux-ha@lists.linux-ha.org # # License: GNU General Public License (GPL) # # Copyright: (C) 2002-2005 International Business Machines # # # An example usage in /etc/ha.d/haresources: # node1 10.0.0.170 apache::/opt/IBMHTTPServer/conf/httpd.conf # node1 10.0.0.170 IBMhttpd # # Our parsing of the Apache config files is very rudimentary. # It'll work with lots of different configurations - but not every # possible configuration. # # Patches are being accepted ;-) # # OCF parameters: # OCF_RESKEY_configfile # OCF_RESKEY_httpd # OCF_RESKEY_port # OCF_RESKEY_statusurl # OCF_RESKEY_options # OCF_RESKEY_testregex # OCF_RESKEY_client # OCF_RESKEY_testurl # OCF_RESKEY_testregex10 # OCF_RESKEY_testconffile # OCF_RESKEY_testname # OCF_RESKEY_envfiles : ${OCF_FUNCTIONS_DIR=${OCF_ROOT}/lib/heartbeat} . ${OCF_FUNCTIONS_DIR}/ocf-shellfuncs . ${OCF_FUNCTIONS_DIR}/apache-conf.sh . 
${OCF_FUNCTIONS_DIR}/http-mon.sh HA_VARRUNDIR=${HA_VARRUN} ####################################################################### # # Configuration options - usually you don't need to change these # ####################################################################### # IBMHTTPD=/opt/IBMHTTPServer/bin/httpd HTTPDLIST="/sbin/httpd2 /usr/sbin/httpd2 /usr/sbin/apache2 /sbin/httpd /usr/sbin/httpd /usr/sbin/apache $IBMHTTPD" MPM=/usr/share/apache2/find_mpm if [ -x $MPM ]; then HTTPDLIST="$HTTPDLIST `$MPM 2>/dev/null`" fi LOCALHOST="http://localhost" HTTPDOPTS="-DSTATUS" DEFAULT_IBMCONFIG=/opt/IBMHTTPServer/conf/httpd.conf DEFAULT_SUSECONFIG="/etc/apache2/httpd.conf" DEFAULT_RHELCONFIG="/etc/httpd/conf/httpd.conf" +DEFAULT_DEBIANCONFIG="/etc/apache2/apache2.conf" # # You can also set # HTTPD # PORT # STATUSURL # CONFIGFILE # in this section if what we're doing doesn't work for you... # # End of Configuration options ####################################################################### CMD=`basename $0` # The config-file-pathname is the pathname to the configuration # file for this web server. Various appropriate defaults are # assumed if no config file is specified. If this command is # invoked as *IBM*, then the default config file name is # $DEFAULT_IBMCONFIG, otherwise the default config file # will be either $DEFAULT_RHELCONFIG or $DEFAULT_SUSECONFIG depending # on which is detected. usage() { cat <<-END usage: $0 action action: start start the web server stop stop the web server status return the status of web server, run or down monitor return TRUE if the web server appears to be working. For this to be supported you must configure mod_status and give it a server-status URL. You have to have installed either curl or wget for this to work. 
meta-data show meta data message validate-all validate the instance parameters END } get_pid() { if [ -f $PidFile ]; then cat $PidFile else false fi } # # return TRUE if a process with given PID is running # ProcessRunning() { local pid=$1 # Use /proc if it looks like it's here... if [ -d /proc -a -d /proc/1 ]; then [ -d /proc/$pid ] else # This assumes we're running as root... kill -s 0 "$pid" >/dev/null 2>&1 fi } silent_status() { local pid pid=`get_pid` if [ -n "$pid" ]; then ProcessRunning $pid else : No pid file false fi } # May be useful to add other distros in future validate_default_config() { if [ -e /etc/SuSE-release ]; then validate_default_suse_config + elif [ -e /etc/debian_version ]; then + validate_default_debian_config else return 0 fi } # When using the default /etc/apache2/httpd.conf on SUSE, the file # /etc/apache2/sysconfig.d/include.conf is required to be present, # but this is only generated if you run the apache init script # (with contents derived from /etc/sysconfig/apache2). So, here, # if we're using the default system config file and it requires # that include, we run "/etc/init.d/apache2 configtest" to ensure # the relevant config is generated and valid. We're also taking # this opportunity to enable mod_status if it's not present. validate_default_suse_config() { if [ "$CONFIGFILE" = "$DEFAULT_SUSECONFIG" ] && \ grep -Eq '^Include[[:space:]]+/etc/apache2/sysconfig.d/include.conf' "$CONFIGFILE" then [ -x "/usr/sbin/a2enmod" ] && ocf_run -q /usr/sbin/a2enmod status # init script style, for crusty old SUSE if [ -e "/etc/init.d/apache2" ]; then ocf_run -q /etc/init.d/apache2 configtest || return 1 # systemd style, for shiny new SUSE elif [ -e "/usr/sbin/start_apache2" ]; then ocf_run -q /usr/sbin/start_apache2 -t || return 1 fi fi return 0 } +# Debian's Default configuration uses a lock directory /var/lock/apache2 +# which is only generated using the lsb init script issues configtest. 
To +# ensure these default directories are present it's useful to run a configtest +# prior to the resource startup which will create the needed directories +# +# To support multiple apache instances the debian scripts and configs +# obey apache2/envvars. (copy /etc/apache2 -> /etc/apache2-instance) +# adjust (SUFFIX) envvars and set OCF_RESKEY_envfiles +validate_default_debian_config() { + if find /etc/apache2* -name apache2.conf | grep -q "$CONFIGFILE" + then + export APACHE_CONFDIR=$(dirname $CONFIGFILE) + [ -x "/usr/sbin/a2enmod" ] && ocf_run -q /usr/sbin/a2enmod status + ocf_run -q /usr/sbin/apache2ctl configtest || return 1 + fi + return 0 +} + apache_start() { if silent_status then ocf_log info "$CMD already running (pid `get_pid`)" return $OCF_SUCCESS fi validate_default_config || return $OCF_ERR_CONFIGURED - # https://bugs.launchpad.net/ubuntu/+source/apache2/+bug/603211 - [ -d /var/run/apache2 ] || mkdir /var/run/apache2 if [ -z $PIDFILE_DIRECTIVE ]; then ocf_run $HTTPD $HTTPDOPTS $OPTIONS -f $CONFIGFILE else ocf_run $HTTPD $HTTPDOPTS $OPTIONS -f $CONFIGFILE -c "PidFile $PidFile" fi tries=0 while : # wait until the user set timeout do apache_monitor ec=$? 
if [ $ec -eq $OCF_NOT_RUNNING ] then tries=`expr $tries + 1` ocf_log info "waiting for apache $CONFIGFILE to come up" sleep 1 else break fi done if [ $ec -ne 0 ] && silent_status; then apache_stop fi return $ec } signal_children() { for sig in SIGTERM SIGHUP SIGKILL ; do if pgrep -f $HTTPD.*$CONFIGFILE >/dev/null ; then pkill -$sig -f $HTTPD.*$CONFIGFILE >/dev/null ocf_log info "signal $sig sent to apache children" sleep 1 else break fi done } graceful_stop() { local tries=10 local pid=$1 # Try graceful stop for half timeout period if timeout period is present if [ -n "$OCF_RESKEY_CRM_meta_timeout" ]; then tries=$((($OCF_RESKEY_CRM_meta_timeout/1000) / 2)) fi ocf_log info "Attempting graceful stop of apache PID $pid" kill -WINCH $pid >/dev/null while ProcessRunning $pid && [ $tries -gt 0 ] do sleep 1 tries=`expr $tries - 1` done if [ $tries -eq 0 ]; then # graceful stop didn't work, process still up. return 1 fi return 0 } kill_stop() { local tries=0 local pid=$1 ocf_log info "Killing apache PID $pid" while ProcessRunning $pid && [ $tries -lt 10 ] do if [ $tries -ne 0 ]; then # don't sleep on the first try sleep 1 fi kill $pid >/dev/null tries=`expr $tries + 1` done } apache_stop() { local ret=$OCF_SUCCESS local pid if ! silent_status; then ocf_log info "$CMD is not running." signal_children return $ret fi pid=`get_pid` graceful_stop $pid if [ $? -ne 0 ]; then kill_stop $pid if ProcessRunning $pid; then ocf_exit_reason "$CMD still running ($pid). Killing pid failed." ret=$OCF_ERR_GENERIC fi fi if [ $ret -eq 0 ]; then ocf_log info "$CMD stopped." fi signal_children return $ret } apache_monitor_10() { - if [ "$TESTCONFFILE" ]; then + if [ -f "$TESTCONFFILE" ] && [ -r "$TESTCONFFILE" ]; then readtestconf < $TESTCONFFILE else test_url="$TESTURL" test_regex="$TESTREGEX10" fi whattorun=`gethttpclient` fixtesturl is_testconf_sane || return $OCF_ERR_CONFIGURED if $whattorun "$test_url" | grep -Ei "$test_regex" > /dev/null then return $OCF_SUCCESS else if ! 
ocf_is_probe; then ocf_exit_reason "Failed to access httpd status page." fi return $OCF_ERR_GENERIC fi } # If the user has not provided any basic monitoring # information, allow the agent to verify the server is # healthy and capable of processing requests by requesting # the http header of website's index attempt_index_monitor_request() { local indexpage="" if [ -n "$OCF_RESKEY_testregex" ]; then return 1; fi if [ -n "$OCF_RESKEY_testregex10" ]; then return 1; fi if [ -n "$OCF_RESKEY_testurl" ]; then return 1; fi if [ -n "$OCF_RESKEY_statusurl" ]; then return 1; fi if [ -n "$OCF_RESKEY_testconffile" ]; then return 1; fi indexpage=$(buildlocalurl) request_url_header $indexpage if [ $? -ne 0 ]; then return $OCF_ERR_GENERIC fi ocf_log info "Successfully retrieved http header at $indexpage" return 0 } apache_monitor_basic() { if ${ourhttpclient}_func "$STATUSURL" | grep -Ei "$TESTREGEX" > /dev/null then return $OCF_SUCCESS fi attempt_index_monitor_request if [ $? -eq 0 ]; then return $OCF_SUCCESS fi if ! ocf_is_probe; then ocf_exit_reason "Failed to access httpd status page." fi return $OCF_ERR_GENERIC } apache_monitor() { silent_status if [ $? -ne 0 ]; then ocf_log info "$CMD not running" return $OCF_NOT_RUNNING fi ourhttpclient=`findhttpclient` # we'll need one if [ -z "$ourhttpclient" ]; then ocf_exit_reason "could not find a http client; make sure that either wget or curl is available" return $OCF_ERR_INSTALLED fi case `ocf_check_level 10` in 0) apache_monitor_basic;; 10) apache_monitor_10;; esac } detect_default_config() { if [ -f $DEFAULT_SUSECONFIG ]; then echo $DEFAULT_SUSECONFIG + elif [ -f $DEFAULT_DEBIANCONFIG ]; then + echo $DEFAULT_DEBIANCONFIG else echo $DEFAULT_RHELCONFIG fi } apache_meta_data(){ cat < 1.0 This is the resource agent for the Apache Web server. This resource agent operates both version 1.x and version 2.x Apache servers. 
The start operation ends with a loop in which monitor is repeatedly called to make sure that the server started and that it is operational. Hence, if the monitor operation does not succeed within the start operation timeout, the apache resource will end with an error status. The monitor operation by default loads the server status page which depends on the mod_status module and the corresponding configuration file (usually /etc/apache2/mod_status.conf). Make sure that the server status page works and that the access is allowed *only* from localhost (address 127.0.0.1). See the statusurl and testregex attributes for more details. See also http://httpd.apache.org/ Manages an Apache Web server instance The full pathname of the Apache configuration file. This file is parsed to provide defaults for various other resource agent parameters. configuration file path The full pathname of the httpd binary (optional). httpd binary path A port number that we can probe for status information using the statusurl. This will default to the port number found in the configuration file, or 80, if none can be found in the configuration file. httpd port The URL to monitor (the apache server status page by default). If left unspecified, it will be inferred from the apache configuration file. If you set this, make sure that it succeeds *only* from the localhost (127.0.0.1). Otherwise, it may happen that the cluster complains about the resource being active on multiple nodes. url name Regular expression to match in the output of statusurl. Case insensitive. monitor regular expression Client to use to query to Apache. If not specified, the RA will try to find one on the system. Currently, wget and curl are supported. For example, you can set this parameter to "curl" if you prefer that to wget. http client URL to test. If it does not start with "http", then it's considered to be relative to the Listen address. test url Regular expression to match in the output of testurl. Case insensitive. 
extended monitor regular expression A file which contains test configuration. Could be useful if you have to check more than one web application or in case sensitive info should be passed as arguments (passwords). Furthermore, using a config file is the only way to specify certain parameters. Please see README.webapps for examples and file description. test configuration file Name of the test within the test configuration file. test name Extra options to apply when starting apache. See man httpd(8). command line options Files (one or more) which contain extra environment variables. If you want to prevent script from reading the default file, set this parameter to empty string. environment settings files We will try to detect if the URL (for monitor) is IPv6, but if that doesn't work set this to true to enforce IPv6. use ipv6 with http clients END return $OCF_SUCCESS } apache_validate_all() { if [ -z "$HTTPD" ]; then ocf_exit_reason "apache httpd program not found" return $OCF_ERR_INSTALLED fi if [ ! -x "$HTTPD" ]; then ocf_exit_reason "HTTPD $HTTPD not found or is not an executable!" return $OCF_ERR_INSTALLED fi if [ ! -f $CONFIGFILE ]; then ocf_exit_reason "Configuration file $CONFIGFILE not found!" return $OCF_ERR_INSTALLED fi + + # validate testconffile/testurl before apache_monitor_10() + if [ -n "$TESTCONFFILE" ]; then + if [ ! -f "$TESTCONFFILE" ] || [ ! -r "$TESTCONFFILE" ]; then + ocf_exit_reason "Configuration file $TESTCONFFILE not found, or not readable." + return $OCF_ERR_INSTALLED + fi + else + if [ -n "$TESTURL" ]; then + # remove leading or trailing spaces/tabs + local temp=$(printf "$TESTURL" | sed -e 's/^[ \t]*//g' -e 's/[ \t]*$//g') + + if [ -z "$temp" ]; then + ocf_exit_reason "testurl: \"$TESTURL\" seems to be an empty string?" + return $OCF_ERR_CONFIGURED + fi + fi + + # FIXME: validate TESTREGEX10 will be needed if empty regex is not allow. 
+ fi + ocf_mkstatedir root 755 `dirname $PidFile` || return $OCF_ERR_INSTALLED return $OCF_SUCCESS } find_httpd_prog() { case $0 in *IBM*) HTTPD=$IBMHTTPD DefaultConfig=$DEFAULT_IBMCONFIG;; *) HTTPD= for h in $HTTPDLIST do if [ -f $h -a -x $h ]; then HTTPD=$h break fi done # Let the user know that the $HTTPD used is not the one (s)he specified via $OCF_RESKEY_httpd if [ "X$OCF_RESKEY_httpd" != X -a "X$HTTPD" != X ]; then ocf_log info "Using $HTTPD as HTTPD" fi DefaultConfig=$(detect_default_config) ;; esac } apache_getconfig() { # these variables are global HTTPD="$OCF_RESKEY_httpd" PORT="$OCF_RESKEY_port" STATUSURL="$OCF_RESKEY_statusurl" CONFIGFILE="$OCF_RESKEY_configfile" OPTIONS="$OCF_RESKEY_options" CLIENT=${OCF_RESKEY_client} TESTREGEX=${OCF_RESKEY_testregex:-''} TESTURL="$OCF_RESKEY_testurl" TESTREGEX10=${OCF_RESKEY_testregex10} TESTCONFFILE="$OCF_RESKEY_testconffile" TESTNAME="$OCF_RESKEY_testname" : ${OCF_RESKEY_envfiles="/etc/apache2/envvars"} source_envfiles $OCF_RESKEY_envfiles if [ "X$HTTPD" = X -o ! -f "$HTTPD" -o ! -x "$HTTPD" ]; then find_httpd_prog fi CONFIGFILE=${CONFIGFILE:-$DefaultConfig} if [ -n "$HTTPD" ]; then httpd_basename=`basename $HTTPD` case $httpd_basename in *-*) httpd_basename=`echo "$httpd_basename" | sed -e 's%\-.*%%'`;; esac fi GetParams $CONFIGFILE } OCF_REQUIRED_PARAMS="" OCF_REQUIRED_BINARIES="" ocf_rarun $* diff --git a/heartbeat/awseip b/heartbeat/awseip new file mode 100755 index 000000000..a1bee44f1 --- /dev/null +++ b/heartbeat/awseip @@ -0,0 +1,247 @@ +#!/bin/sh +# +# +# Manage Elastic IP with Pacemaker +# +# +# Copyright 2016 guessi +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# + +# +# Prerequisites: +# +# - preconfigured AWS CLI running environment (AccessKey, SecretAccessKey, etc.) +# - a reserved secondary private IP address for EC2 instances high availablity +# - IAM user role with the following permissions: +# * DescribeInstances +# * AssociateAddress +# * DisassociateAddress +# + +####################################################################### +# Initialization: + +: ${OCF_FUNCTIONS_DIR=${OCF_ROOT}/lib/heartbeat} +. ${OCF_FUNCTIONS_DIR}/ocf-shellfuncs + +####################################################################### + +# +# Defaults +# +OCF_RESKEY_awscli_default="/usr/bin/awscli" +OCF_RESKEY_api_delay_default="1" + +: ${OCF_RESKEY_awscli=${OCF_RESKEY_awscli_default}} +: ${OCF_RESKEY_api_delay=${OCF_RESKEY_api_delay_default}} + +meta_data() { + cat < + + +1.0 + + +description + + + + + + + +command line tools for aws services + +aws cli tools + + + + + +reserved elastic ip for ec2 instance + +reserved elastic ip for ec2 instance + + + + + +reserved allocation id for ec2 instance + +reserved allocation id for ec2 instance + + + + + +predefined private ip address for ec2 instance + +predefined private ip address for ec2 instance + + + + + +a short delay between API calls, to avoid sending API too quick + +a short delay between API calls + + + + + + + + + + + + + + + + + +END +} + +####################################################################### + +awseip_usage() { + cat < +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the 
License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# + +# +# Prerequisites: +# +# - preconfigured AWS CLI running environment (AccessKey, SecretAccessKey, etc.) +# - a reserved secondary private IP address for EC2 instances high availablity +# - IAM user role with the following permissions: +# * DescribeInstances +# * AssignPrivateIpAddresses +# * UnassignPrivateIpAddresses +# + +####################################################################### +# Initialization: + +: ${OCF_FUNCTIONS_DIR=${OCF_ROOT}/lib/heartbeat} +. ${OCF_FUNCTIONS_DIR}/ocf-shellfuncs + +####################################################################### + +# +# Defaults +# +OCF_RESKEY_awscli_default="/usr/bin/awscli" +OCF_RESKEY_api_delay_default="1" + +: ${OCF_RESKEY_awscli=${OCF_RESKEY_awscli_default}} +: ${OCF_RESKEY_api_delay=${OCF_RESKEY_api_delay_default}} + +meta_data() { + cat < + + +1.0 + + +description + + + + + + + +command line tools for aws services + +aws cli tools + + + + + +reserved secondary private ip for ec2 instance + +reserved secondary private ip for ec2 instance + + + + + +a short delay between API calls, to avoid sending API too quick + +a short delay between API calls + + + + + + + + + + + + + + + + + +END +} + +####################################################################### + +awsvip_usage() { + cat < # All Rights Reserved. # # This program is free software; you can redistribute it and/or modify # it under the terms of version 2 of the GNU General Public License as # published by the Free Software Foundation. 
# # This program is distributed in the hope that it would be useful, but # WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. # # Further, this software is distributed without any warranty that it is # free of the rightful claim of any third person regarding infringement # or the like. Any license provided herein, whether implied or # otherwise, applies only to this software file. Patent licenses, if # any, provided herein do not apply to combinations of this program with # other software, or any other product whatsoever. # # You should have received a copy of the GNU General Public License # along with this program; if not, write the Free Software Foundation, # Inc., 59 Temple Place - Suite 330, Boston MA 02111-1307, USA. # ####################################################################### # Initialization: : ${OCF_FUNCTIONS_DIR=${OCF_ROOT}/lib/heartbeat} . ${OCF_FUNCTIONS_DIR}/ocf-shellfuncs ####################################################################### meta_data() { cat < 1.0 The docker HA resource agent creates and launches a docker container based off a supplied docker image. Containers managed by this agent are both created and removed upon the agent's start and stop actions. Docker container resource agent. The docker image to base this container off of. docker image The name to give the created container. By default this will be that resource's instance name. docker container name Allow the image to be pulled from the configured docker registry when the image does not exist locally. NOTE, this can drastically increase the time required to start the container if the image repository is pulled over the network. Allow pulling non-local images Add options to be appended to the 'docker run' command which is used when creating the container during the start action. 
This option allows users to do things such as setting a custom entry point and injecting environment variables into the newly created container. Note the '-d' option is supplied regardless of this value to force containers to run in the background. NOTE: Do not explicitly specify the --name argument in the run_opts. This agent will set --name using either the resource's instance or the name provided in the 'name' argument of this agent. run options Specifiy a command to launch within the container once it has initialized. run command Specifiy the full path of a command to launch within the container to check the health of the container. This command must return 0 to indicate that the container is healthy. A non-zero return code will indicate that the container has failed and should be recovered. -The command is executed using nsenter. In the future 'docker exec' will -be used once it is more widely supported. +If 'docker exec' is supported, it is used to execute the command. If not, +nsenter is used. monitor command Kill a container immediately rather than waiting for it to gracefully shutdown force kill Allow the container to be reused after stopping the container. By default containers are removed after stop. With the reuse option containers will persist after the container stops. reuse container END } ####################################################################### REQUIRE_IMAGE_PULL=0 docker_usage() { cat <&1) - rc=$? + if docker exec --help >/dev/null 2>&1; then + out=$(docker exec ${CONTAINER} $OCF_RESKEY_monitor_cmd 2>&1) + rc=$? + else + out=$(echo "$OCF_RESKEY_monitor_cmd" | nsenter --target $(docker inspect --format {{.State.Pid}} ${CONTAINER}) --mount --uts --ipc --net --pid 2>&1) + rc=$? + fi + if [ $rc -ne 0 ]; then ocf_log info "monitor cmd exit code = $rc" ocf_log info "stdout/stderr: $out" if [ $rc -eq 127 ]; then ocf_exit_reason "monitor_cmd, ${OCF_RESKEY_monitor_cmd} , not found within container." 
# there is no recovering from this, exit immediately exit $OCF_ERR_ARGS fi rc=$OCF_ERR_GENERIC else ocf_log info "monitor cmd passed: exit code = $rc" fi return $rc } container_exists() { docker inspect --format {{.State.Running}} $CONTAINER | egrep '(true|false)' >/dev/null 2>&1 } remove_container() { if ocf_is_true "$OCF_RESKEY_reuse"; then # never remove the container if we have reuse enabled. return 0 fi container_exists if [ $? -ne 0 ]; then # don't attempt to remove a container that doesn't exist return 0 fi ocf_log notice "Cleaning up inactive container, ${CONTAINER}." ocf_run docker rm $CONTAINER } docker_simple_status() { local val container_exists if [ $? -ne 0 ]; then return $OCF_NOT_RUNNING fi # retrieve the 'Running' attribute for the container val=$(docker inspect --format {{.State.Running}} $CONTAINER 2>/dev/null) if [ $? -ne 0 ]; then #not running as a result of container not being found return $OCF_NOT_RUNNING fi if ocf_is_true "$val"; then # container exists and is running return $OCF_SUCCESS fi return $OCF_NOT_RUNNING } docker_monitor() { local rc=0 docker_simple_status rc=$? if [ $rc -ne 0 ]; then return $rc fi monitor_cmd_exec } docker_start() { local run_opts="-d --name=${CONTAINER}" # check to see if the container has already started docker_simple_status if [ $? -eq $OCF_SUCCESS ]; then return $OCF_SUCCESS fi if [ -n "$OCF_RESKEY_run_opts" ]; then run_opts="$run_opts $OCF_RESKEY_run_opts" fi if [ $REQUIRE_IMAGE_PULL -eq 1 ]; then ocf_log notice "Beginning pull of image, ${OCF_RESKEY_image}" docker pull "${OCF_RESKEY_image}" if [ $? -ne 0 ]; then ocf_exit_reason "failed to pull image ${OCF_RESKEY_image}" return $OCF_ERR_GENERIC fi fi if ocf_is_true "$OCF_RESKEY_reuse" && container_exists; then ocf_log info "starting existing container $CONTAINER." ocf_run docker start $CONTAINER else # make sure any previous container matching our container name is cleaned up first. 
# we already know at this point it wouldn't be running remove_container ocf_log info "running container $CONTAINER for the first time" ocf_run docker run $run_opts $OCF_RESKEY_image $OCF_RESKEY_run_cmd fi if [ $? -ne 0 ]; then ocf_exit_reason "docker failed to launch container" return $OCF_ERR_GENERIC fi # wait for monitor to pass before declaring that the container is started while true; do docker_simple_status if [ $? -ne $OCF_SUCCESS ]; then ocf_exit_reason "Newly created docker container exited after start" return $OCF_ERR_GENERIC fi monitor_cmd_exec if [ $? -eq $OCF_SUCCESS ]; then ocf_log notice "Container $CONTAINER started successfully" return $OCF_SUCCESS fi ocf_exit_reason "waiting on monitor_cmd to pass after start" sleep 1 done } docker_stop() { local timeout=60 docker_simple_status if [ $? -eq $OCF_NOT_RUNNING ]; then remove_container return $OCF_SUCCESS fi if [ -n "$OCF_RESKEY_CRM_meta_timeout" ]; then timeout=$((($OCF_RESKEY_CRM_meta_timeout/1000) -10 )) if [ $timeout -lt 10 ]; then timeout=10 fi fi if ocf_is_true "$OCF_RESKEY_force_kill"; then ocf_run docker kill $CONTAINER else ocf_log debug "waiting $timeout second[s] before killing container" ocf_run docker stop -t=$timeout $CONTAINER fi if [ $? -ne 0 ]; then ocf_exit_reason "Failed to stop container, ${CONTAINER}, based on image, ${OCF_RESKEY_image}." return $OCF_ERR_GENERIC fi remove_container if [ $? -ne 0 ]; then ocf_exit_reason "Failed to remove stopped container, ${CONTAINER}, based on image, ${OCF_RESKEY_image}." 
return $OCF_ERR_GENERIC fi return $OCF_SUCCESS } image_exists() { # assume that OCF_RESKEY_name have been validated local IMAGE_NAME="$(echo ${OCF_RESKEY_image} | awk -F':' '{print $1}')" # if no tag was specified, use default "latest" local COLON_FOUND=0 local IMAGE_TAG="latest" COLON_FOUND="$(echo "${OCF_RESKEY_image}" | grep -o ':' | grep -c .)" if [ ${COLON_FOUND} -ne 0 ]; then IMAGE_TAG="$(echo ${OCF_RESKEY_image} | awk -F':' '{print $NF}')" fi # IMAGE_NAME might be following formats: # - image # - repository/image # - docker.io/image (some distro will display "docker.io/" as prefix) docker images | awk '{print $1 ":" $2}' | egrep -q -s "^(docker.io\/)?${IMAGE_NAME}:${IMAGE_TAG}\$" if [ $? -eq 0 ]; then # image found return 0 fi if ocf_is_true "$OCF_RESKEY_allow_pull"; then REQUIRE_IMAGE_PULL=1 ocf_log notice "Image (${OCF_RESKEY_image}) does not exist locally but will be pulled during start" return 0 fi # image not found. return 1 } docker_validate() { check_binary docker if [ -z "$OCF_RESKEY_image" ]; then ocf_exit_reason "'image' option is required" exit $OCF_ERR_CONFIGURED fi if [ -n "$OCF_RESKEY_monitor_cmd" ]; then - ocf_log info "checking for nsenter, which is required when 'monitor_cmd' is specified" - check_binary nsenter + docker exec --help >/dev/null 2>&1 + if [ ! $? ]; then + ocf_log info "checking for nsenter, which is required when 'monitor_cmd' is specified" + check_binary nsenter + fi fi image_exists if [ $? -ne 0 ]; then ocf_exit_reason "base image, ${OCF_RESKEY_image}, could not be found." exit $OCF_ERR_CONFIGURED fi return $OCF_SUCCESS } # TODO : # When a user starts plural clones in a node in globally-unique, a user cannot appoint plural name parameters. # When a user appoints reuse, the resource agent cannot connect plural clones with a container. 
if ocf_is_true "$OCF_RESKEY_CRM_meta_globally_unique"; then if [ -n "$OCF_RESKEY_name" ]; then if [ -n "$OCF_RESKEY_CRM_meta_clone_node_max" ] && [ "$OCF_RESKEY_CRM_meta_clone_node_max" -ne 1 ] then ocf_exit_reason "Cannot make plural clones from the same name parameter." exit $OCF_ERR_CONFIGURED fi if [ -n "$OCF_RESKEY_CRM_meta_master_node_max" ] && [ "$OCF_RESKEY_CRM_meta_master_node_max" -ne 1 ] then ocf_exit_reason "Cannot make plural master from the same name parameter." exit $OCF_ERR_CONFIGURED fi fi : ${OCF_RESKEY_name=`echo ${OCF_RESOURCE_INSTANCE} | tr ':' '-'`} else : ${OCF_RESKEY_name=${OCF_RESOURCE_INSTANCE}} fi if [ -n "$OCF_RESKEY_container" ]; then # we'll keep the container attribute around for a bit in order not to break # any existing deployments. The 'name' attribute is prefered now though. CONTAINER=$OCF_RESKEY_container ocf_log warn "The 'container' attribute is depreciated" else CONTAINER=$OCF_RESKEY_name fi case $__OCF_ACTION in meta-data) meta_data exit $OCF_SUCCESS;; start) docker_validate docker_start;; stop) docker_stop;; monitor) docker_monitor;; validate-all) docker_validate;; usage|help) docker_usage exit $OCF_SUCCESS ;; *) docker_usage exit $OCF_ERR_UNIMPLEMENTED ;; esac rc=$? ocf_log debug "${OCF_RESOURCE_INSTANCE} $__OCF_ACTION : $rc" exit $rc diff --git a/heartbeat/exportfs b/heartbeat/exportfs index 4b88fa1ed..c6ea920fd 100755 --- a/heartbeat/exportfs +++ b/heartbeat/exportfs @@ -1,434 +1,442 @@ #!/bin/sh # exportfs # # Description: Manages nfs exported file system. # # (c) 2010 Ben Timby, Florian Haas, Dejan Muhamedagic, # and Linux-HA contributors # # License: GNU General Public License v2 (GPLv2) and later ####################################################################### # Initialization: : ${OCF_FUNCTIONS_DIR=${OCF_ROOT}/lib/heartbeat} . 
${OCF_FUNCTIONS_DIR}/ocf-shellfuncs # Defaults OCF_RESKEY_unlock_on_stop_default=1 OCF_RESKEY_wait_for_leasetime_on_stop_default=0 OCF_RESKEY_rmtab_backup_default=".rmtab" : ${OCF_RESKEY_unlock_on_stop=${OCF_RESKEY_unlock_on_stop_default}} : ${OCF_RESKEY_wait_for_leasetime_on_stop=${OCF_RESKEY_wait_for_leasetime_on_stop_default}} : ${OCF_RESKEY_rmtab_backup=${OCF_RESKEY_rmtab_backup_default}} ####################################################################### exportfs_meta_data() { cat < 1.0 Exportfs uses the exportfs command to add/remove nfs exports. It does NOT manage the nfs server daemon. It depends on Linux specific NFS implementation details, so is considered not portable to other platforms yet. Manages NFS exports The client specification allowing remote machines to mount the directory (or directories) over NFS. Client ACL. The options to pass to exportfs for the exported directory or directories. Export options. The directory or directories to be exported using NFS. Multiple directories are separated by white space. The directory or directories to export. The fsid option to pass to exportfs. This can be a unique positive integer, a UUID, or the special string "root" which is functionally identical to numeric fsid of 0. If multiple directories are being exported, then they are assigned ids sequentially starting with this fsid (fsid, fsid+1, fsid+2, ...). Obviously, in that case the fsid must be an integer. 0 (root) identifies the export as the root of an NFSv4 pseudofilesystem -- avoid this setting unless you understand its special status. This value will override any fsid provided via the options parameter. Unique fsid within cluster or starting fsid for multiple exports. Relinquish NFS locks associated with this filesystem when the resource stops. Enabling this parameter is highly recommended unless the path exported by this ${__SCRIPT_NAME} resource is also exported by a different resource. 
Note: Unlocking is only possible on Linux systems where /proc/fs/nfsd/unlock_filesystem exists and is writable. If your system does not fulfill this requirement (on account of having an nonrecent kernel, for example), you may set this parameter to 0 to silence the associated warning. Unlock filesystem on stop? When stopping (unexporting), wait out the NFSv4 lease time. Only after all leases have expired does the NFS kernel server relinquish all server-side handles on the exported filesystem. If this ${__SCRIPT_NAME} resource manages an export that resides on a mount point designed to fail over along with the NFS export itself, then enabling this parameter will ensure such failover is working properly. Note that when this parameter is set, your stop timeout MUST accommodate for the wait period. This parameter is safe to disable if none of your NFS clients are using NFS version 4 or later. Ride out the NFSv4 lease time on resource stop? Back up those entries from the NFS rmtab that apply to the exported directory, to the specified backup file. The filename is interpreted as relative to the exported directory. This backup is required if clients are connecting to the export via NFSv3 over TCP. Note that a configured monitor operation is required for this functionality. To disable rmtab backups, set this parameter to the special string "none". Location of the rmtab backup, relative to directory. END return $OCF_SUCCESS } exportfs_methods() { cat <<-! start stop status monitor validate-all methods meta-data usage ! 
} reset_fsid() { CURRENT_FSID=$OCF_RESKEY_fsid } bump_fsid() { CURRENT_FSID=$((CURRENT_FSID+1)) } get_fsid() { echo $CURRENT_FSID } # run a function on all directories forall() { local func=$1 shift 1 local fast_exit="" local dir rc=0 if [ "$2" = fast_exit ]; then fast_exit=1 shift 1 fi reset_fsid for dir in $OCF_RESKEY_directory; do $func $dir "$@" rc=$(($rc | $?)) [ $NUMDIRS -gt 1 ] && bump_fsid [ "$fast_exit" ] && continue [ $rc -ne 0 ] && return $rc done return $rc } backup_rmtab() { local dir=$1 local rmtab_backup rmtab_backup="$dir/${OCF_RESKEY_rmtab_backup}" grep ":$dir:" /var/lib/nfs/rmtab > ${rmtab_backup} } restore_rmtab() { local dir=$1 local rmtab_backup rmtab_backup="$dir/${OCF_RESKEY_rmtab_backup}" if [ -r ${rmtab_backup} ]; then local tmpf=`mktemp` sort -u ${rmtab_backup} /var/lib/nfs/rmtab > $tmpf && install -o root -m 644 $tmpf /var/lib/nfs/rmtab rm -f $tmpf ocf_log debug "Restored `wc -l ${rmtab_backup}` rmtab entries from ${rmtab_backup}." else ocf_log warn "rmtab backup ${rmtab_backup} not found or not readable." fi } exportfs_usage() { cat <" instead of "*" format_exports | grep -q -x -F "$dir " rc=$? fi # log something only for monitors if [ $rc -ne 0 -a "$__OCF_ACTION" = "monitor" ]; then local sev="info" ocf_is_probe || sev="err" ocf_log $sev "$dir not exported to $spec (stopped)." fi return $rc } exportfs_monitor () { - if forall is_exported "${OCF_RESKEY_clientspec}"; then + if ! 
ha_pseudo_resource "${OCF_RESOURCE_INSTANCE}" monitor; then + return $OCF_NOT_RUNNING + fi + + if forall is_exported "$(echo "${OCF_RESKEY_clientspec}" | tr -d '[]')"; then if [ ${OCF_RESKEY_rmtab_backup} != "none" ]; then forall backup_rmtab fi return $OCF_SUCCESS else return $OCF_NOT_RUNNING fi } export_one() { local dir=$1 local opts sep sep="" if [ -n "$OCF_RESKEY_options" ]; then opts="$OCF_RESKEY_options" sep="," fi if echo "$opts" | grep fsid >/dev/null; then #replace fsid in options list opts=`echo "$opts" | sed "s/fsid=[0-9]\+/fsid=$(get_fsid)/g"` else #tack the fsid option onto our options list. opts="${opts}${sep}fsid=$(get_fsid)" fi opts="-o $opts" # if any of directories fails to export we can exit # immediately ocf_run exportfs -v $opts "${OCF_RESKEY_clientspec}:$dir" if [ $? -ne 0 ]; then ocf_exit_reason "exportfs failed - exportfs -v $opts ${OCF_RESKEY_clientspec}:$dir" exit $OCF_ERR_GENERIC fi ocf_log info "directory $dir exported" return $OCF_SUCCESS } exportfs_start () { if exportfs_monitor; then ocf_log debug "already exported" return $OCF_SUCCESS fi ocf_log info "Exporting file system(s) ..." 
+ + ha_pseudo_resource "${OCF_RESOURCE_INSTANCE}" start forall export_one # Restore the rmtab to ensure smooth NFS-over-TCP failover if [ ${OCF_RESKEY_rmtab_backup} != "none" ]; then forall restore_rmtab fi } unlock_fs() { local dir=$1 local unlockfile unlockfile=/proc/fs/nfsd/unlock_filesystem if [ -w ${unlockfile} ]; then echo "$dir" > ${unlockfile} ocf_log info "Unlocked NFS export $dir" else ocf_log warn "Unable to unlock NFS export $dir, ${unlockfile} not found or not writable" fi } wait_for_leasetime() { local leasetimefile local sleeptime leasetimefile=/proc/fs/nfsd/nfsv4leasetime if [ -r ${leasetimefile} ]; then sleeptime=$((`cat ${leasetimefile}`+2)) ocf_log info "Sleeping ${sleeptime} seconds to accommodate for NFSv4 lease expiry" sleep ${sleeptime}s else ocf_log warn "Unable to read NFSv4 lease time from ${leasetimefile}, file not found or not readable" fi } cleanup_export_cache() { # see if the cache is blocking unexport local contentfile=/proc/net/rpc/nfsd.export/content local fsid_re local i=1 fsid_re="fsid=(echo `forall get_fsid`|sed 's/ /|/g')," while :; do grep -E -q "$fsid_re" $contentfile || break ocf_log info "Cleanup export cache ... (try $i)" ocf_run exportfs -f sleep 0.5 i=$((i + 1)) done } unexport_one() { local dir=$1 ocf_run exportfs -v -u ${OCF_RESKEY_clientspec}:$dir } exportfs_stop () { local rc exportfs_monitor if [ $? -eq $OCF_NOT_RUNNING ]; then ocf_log debug "not exported" return $OCF_SUCCESS fi ocf_log info "Un-exporting file system ..." # Backup the rmtab to ensure smooth NFS-over-TCP failover if [ ${OCF_RESKEY_rmtab_backup} != "none" ]; then forall backup_rmtab fi forall unexport_one rc=$? 
if ocf_is_true ${OCF_RESKEY_unlock_on_stop}; then forall unlock_fs fi if ocf_is_true ${OCF_RESKEY_wait_for_leasetime_on_stop}; then wait_for_leasetime fi if [ $rc -eq 0 ]; then cleanup_export_cache + ha_pseudo_resource "${OCF_RESOURCE_INSTANCE}" stop + ocf_log info "Un-exported file system(s)" return $OCF_SUCCESS else ocf_exit_reason "Failed to un-export file system(s)" return $OCF_ERR_GENERIC fi } testdir() { if [ ! -d $1 ]; then ocf_is_probe || ocf_log err "$1 does not exist or is not a directory" return 1 fi return 0 } exportfs_validate_all () { if [ $NUMDIRS -gt 1 ] && ! ocf_is_decimal "$OCF_RESKEY_fsid"; then ocf_exit_reason "use integer fsid when exporting multiple directories" return $OCF_ERR_CONFIGURED fi if ! forall testdir; then return $OCF_ERR_INSTALLED fi } # If someone puts a trailing slash at the end of the export directory, # this agent is going to fail in some unexpected ways due to how # export strings are matched. The simplest solution here is to strip off # a trailing '/' in the directory before processing anything. newdir=$(echo "$OCF_RESKEY_directory" | sed -n -e 's/^\(.*\)\/$/\1/p') if [ -n "$newdir" ]; then OCF_RESKEY_directory=$newdir fi NUMDIRS=`echo "$OCF_RESKEY_directory" | wc -w` OCF_REQUIRED_PARAMS="directory fsid clientspec" OCF_REQUIRED_BINARIES="exportfs" ocf_rarun $* diff --git a/heartbeat/galera b/heartbeat/galera index 7be2b00b1..543200d59 100755 --- a/heartbeat/galera +++ b/heartbeat/galera @@ -1,800 +1,977 @@ #!/bin/sh # # Copyright (c) 2014 David Vossel # All Rights Reserved. # # This program is free software; you can redistribute it and/or modify # it under the terms of version 2 of the GNU General Public License as # published by the Free Software Foundation. # # This program is distributed in the hope that it would be useful, but # WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
# # Further, this software is distributed without any warranty that it is # free of the rightful claim of any third person regarding infringement # or the like. Any license provided herein, whether implied or # otherwise, applies only to this software file. Patent licenses, if # any, provided herein do not apply to combinations of this program with # other software, or any other product whatsoever. # # You should have received a copy of the GNU General Public License # along with this program; if not, write the Free Software Foundation, # Inc., 59 Temple Place - Suite 330, Boston MA 02111-1307, USA. # ## # README. # # This agent only supports being configured as a multistate Master # resource. # # Slave vs Master role: # # During the 'Slave' role, galera instances are in read-only mode and # will not attempt to connect to the cluster. This role exists as # a means to determine which galera instance is the most up-to-date. The # most up-to-date node will be used to bootstrap a galera cluster that # has no current members. # # The galera instances will only begin to be promoted to the Master role # once all the nodes in the 'wsrep_cluster_address' connection address # have entered read-only mode. At that point the node containing the # database that is most current will be promoted to Master. # # Once the first Master instance bootstraps the galera cluster, the # other nodes will join the cluster and start synchronizing via SST. # They will stay in Slave role as long as the SST is running. Their # promotion to Master will happen once synchronization is finished. # # Example: Create a galera cluster using nodes rhel7-node1 rhel7-node2 rhel7-node3 # # pcs resource create db galera enable_creation=true \ # wsrep_cluster_address="gcomm://rhel7-auto1,rhel7-auto2,rhel7-auto3" meta master-max=3 --master # # By setting the 'enable_creation' option, the database will be automatically # generated at startup. 
The meta attribute 'master-max=3' means that all 3 # nodes listed in the wsrep_cluster_address list will be allowed to connect # to the galera cluster and perform replication. # # NOTE: If you have more nodes in the pacemaker cluster then you wish # to have in the galera cluster, make sure to use location contraints to prevent # pacemaker from attempting to place a galera instance on a node that is # not in the 'wsrep_cluster_address" list. # ## ####################################################################### # Initialization: : ${OCF_FUNCTIONS_DIR=${OCF_ROOT}/lib/heartbeat} . ${OCF_FUNCTIONS_DIR}/ocf-shellfuncs . ${OCF_FUNCTIONS_DIR}/mysql-common.sh # It is common for some galera instances to store # check user that can be used to query status # in this file if [ -f "/etc/sysconfig/clustercheck" ]; then . /etc/sysconfig/clustercheck +elif [ -f "/etc/default/clustercheck" ]; then + . /etc/default/clustercheck fi ####################################################################### usage() { cat < 1.0 Resource script for managing galara database. Manages a galara instance Location of the MySQL server binary MySQL server binary Location of the MySQL client binary MySQL client binary Configuration file MySQL config Directory containing databases MySQL datadir User running MySQL daemon MySQL user Group running MySQL daemon (for logfile and directory permissions) MySQL group The logfile to be used for mysqld. MySQL log file The pidfile to be used for mysqld. MySQL pid file The socket to be used for mysqld. MySQL socket If the MySQL database does not exist, it will be created Create the database if it does not exist Additional parameters which are passed to the mysqld on startup. (e.g. --skip-external-locking or --skip-grant-tables) Additional parameters to pass to mysqld The galera cluster address. This takes the form of: gcomm://node,node,node Only nodes present in this node list will be allowed to start a galera instance. 
It is expected that the galera node names listed in this address match valid pacemaker node names. Galera cluster address Cluster check user. MySQL test user Cluster check user password check password END } get_option_variable() { local key=$1 $MYSQL $MYSQL_OPTIONS_CHECK -e "SHOW VARIABLES like '$key';" | tail -1 } get_status_variable() { local key=$1 $MYSQL $MYSQL_OPTIONS_CHECK -e "show status like '$key';" | tail -1 } set_bootstrap_node() { local node=$1 ${HA_SBIN_DIR}/crm_attribute -N $node -l reboot --name "${INSTANCE_ATTR_NAME}-bootstrap" -v "true" } clear_bootstrap_node() { ${HA_SBIN_DIR}/crm_attribute -N $NODENAME -l reboot --name "${INSTANCE_ATTR_NAME}-bootstrap" -D } is_bootstrap() { ${HA_SBIN_DIR}/crm_attribute -N $NODENAME -l reboot --name "${INSTANCE_ATTR_NAME}-bootstrap" -Q 2>/dev/null } +set_no_grastate() +{ + ${HA_SBIN_DIR}/crm_attribute -N $NODENAME -l reboot --name "${INSTANCE_ATTR_NAME}-no-grastate" -v "true" +} + +clear_no_grastate() +{ + ${HA_SBIN_DIR}/crm_attribute -N $NODENAME -l reboot --name "${INSTANCE_ATTR_NAME}-no-grastate" -D +} + +is_no_grastate() +{ + local node=$1 + ${HA_SBIN_DIR}/crm_attribute -N $node -l reboot --name "${INSTANCE_ATTR_NAME}-no-grastate" -Q 2>/dev/null +} + clear_last_commit() { ${HA_SBIN_DIR}/crm_attribute -N $NODENAME -l reboot --name "${INSTANCE_ATTR_NAME}-last-committed" -D } set_last_commit() { ${HA_SBIN_DIR}/crm_attribute -N $NODENAME -l reboot --name "${INSTANCE_ATTR_NAME}-last-committed" -v $1 } get_last_commit() { local node=$1 if [ -z "$node" ]; then ${HA_SBIN_DIR}/crm_attribute -N $NODENAME -l reboot --name "${INSTANCE_ATTR_NAME}-last-committed" -Q 2>/dev/null else ${HA_SBIN_DIR}/crm_attribute -N $node -l reboot --name "${INSTANCE_ATTR_NAME}-last-committed" -Q 2>/dev/null fi } wait_for_sync() { local state=$(get_status_variable "wsrep_local_state") ocf_log info "Waiting for database to sync with the cluster. 
" while [ "$state" != "4" ]; do sleep 1 state=$(get_status_variable "wsrep_local_state") done ocf_log info "Database synced." } set_sync_needed() { ${HA_SBIN_DIR}/crm_attribute -N $NODENAME -l reboot --name "${INSTANCE_ATTR_NAME}-sync-needed" -v "true" } clear_sync_needed() { ${HA_SBIN_DIR}/crm_attribute -N $NODENAME -l reboot --name "${INSTANCE_ATTR_NAME}-sync-needed" -D } check_sync_needed() { ${HA_SBIN_DIR}/crm_attribute -N $NODENAME -l reboot --name "${INSTANCE_ATTR_NAME}-sync-needed" -Q 2>/dev/null } + +# this function is called when attribute sync-needed is set in the CIB check_sync_status() { - local state=$(get_status_variable "wsrep_local_state") - local ready=$(get_status_variable "wsrep_ready") + # if the pidfile is created, mysqld is up and running + # an IST might still be in progress, check wsrep status + if [ -e $OCF_RESKEY_pid ]; then + local cluster_status=$(get_status_variable "wsrep_cluster_status") + local state=$(get_status_variable "wsrep_local_state") + local ready=$(get_status_variable "wsrep_ready") + + if [ -z "$cluster_status" -o -z "$state" -o -z "$ready" ]; then + ocf_exit_reason "Unable to retrieve state transfer status, verify check_user '$OCF_RESKEY_check_user' has permissions to view status" + return $OCF_ERR_GENERIC + fi - if [ -z "$state" -o -z "$ready" ]; then - ocf_exit_reason "Unable to retrieve state transfer status, verify check_user '$OCF_RESKEY_check_user' has permissions to view status" - return $OCF_ERR_GENERIC - fi + if [ "$cluster_status" != "Primary" ]; then + ocf_exit_reason "local node <${NODENAME}> is started, but not in primary mode. Unknown state." 
+ return $OCF_ERR_GENERIC + fi - if [ "$state" == "4" -a "$ready" == "ON" ]; then - ocf_log info "local node synced with the cluster" - # when sync is finished, we are ready to switch to Master - clear_sync_needed - set_master_score - return $OCF_SUCCESS - else - ocf_log info "local node syncing" - return $OCF_SUCCESS + if [ "$state" = "4" -a "$ready" = "ON" ]; then + ocf_log info "local node synced with the cluster" + # when sync is finished, we are ready to switch to Master + clear_sync_needed + set_master_score + return $OCF_SUCCESS + fi fi + + # if we pass here, an IST or SST is still in progress + ocf_log info "local node syncing" + return $OCF_SUCCESS } is_primary() { cluster_status=$(get_status_variable "wsrep_cluster_status") if [ "$cluster_status" = "Primary" ]; then return 0 fi if [ -z "$cluster_status" ]; then ocf_exit_reason "Unable to retrieve wsrep_cluster_status, verify check_user '$OCF_RESKEY_check_user' has permissions to view status" else ocf_log info "Galera instance wsrep_cluster_status=${cluster_status}" fi return 1 } is_readonly() { local res=$(get_option_variable "read_only") if ! ocf_is_true "$res"; then return 1 fi cluster_status=$(get_status_variable "wsrep_cluster_status") if ! [ "$cluster_status" = "Disconnected" ]; then return 1 fi return 0 } master_exists() { if [ "$__OCF_ACTION" = "demote" ]; then # We don't want to detect master instances during demote. # 1. we could be detecting ourselves as being master, which is no longer the case. # 2. we could be detecting other master instances that are in the process of shutting down. # by not detecting other master instances in "demote" we are deferring this check # to the next recurring monitor operation which will be much more accurate return 1 fi # determine if a master instance is already up and is healthy crm_mon --as-xml | grep "resource.*id=\"${OCF_RESOURCE_INSTANCE}\".*role=\"Master\".*active=\"true\".*orphaned=\"false\".*failed=\"false\"" > /dev/null 2>&1 return $? 
} clear_master_score() { local node=$1 if [ -z "$node" ]; then $CRM_MASTER -D else $CRM_MASTER -D -N $node fi } set_master_score() { local node=$1 if [ -z "$node" ]; then $CRM_MASTER -v 100 else $CRM_MASTER -N $node -v 100 fi } greater_than_equal_long() { # there are values we need to compare in this script # that are too large for shell -gt to process echo | awk -v n1="$1" -v n2="$2" '{if (n1>=n2) printf ("true"); else printf ("false");}' | grep -q "true" } detect_first_master() { local best_commit=0 local best_node="$NODENAME" local last_commit=0 local missing_nodes=0 + local nodes="" + local nodes_recovered="" + # avoid selecting a recovered node as bootstrap if possible for node in $(echo "$OCF_RESKEY_wsrep_cluster_address" | sed 's/gcomm:\/\///g' | tr -d ' ' | tr -s ',' ' '); do + if is_no_grastate $node; then + nodes_recovered="$nodes_recovered $node" + else + nodes="$nodes $node" + fi + done + + for node in $nodes_recovered $nodes; do last_commit=$(get_last_commit $node) if [ -z "$last_commit" ]; then ocf_log info "Waiting on node <${node}> to report database status before Master instances can start." missing_nodes=1 continue fi # this means -1, or that no commit has occured yet. if [ "$last_commit" = "18446744073709551615" ]; then last_commit="0" fi greater_than_equal_long "$last_commit" "$best_commit" if [ $? -eq 0 ]; then best_node=$node best_commit=$last_commit fi done if [ $missing_nodes -eq 1 ]; then return fi ocf_log info "Promoting $best_node to be our bootstrap node" set_master_score $best_node set_bootstrap_node $best_node } +detect_galera_pid() +{ + ps auxww | grep -v -e "${OCF_RESKEY_binary}" -e grep | grep -qe "--pid-file=$OCF_RESKEY_pid" +} + +galera_status() +{ + local loglevel=$1 + local rc + local running + + if [ -e $OCF_RESKEY_pid ]; then + mysql_common_status $loglevel + rc=$? + else + # if pidfile is not created, the server may + # still be starting up, e.g. running SST + detect_galera_pid + running=$? 
+ if [ $running -eq 0 ]; then + rc=$OCF_SUCCESS + else + ocf_log $loglevel "MySQL is not running" + rc=$OCF_NOT_RUNNING + fi + fi + + return $rc +} + +galera_start_nowait() +{ + local mysql_extra_params="$1" + local pid + local running + + ${OCF_RESKEY_binary} --defaults-file=$OCF_RESKEY_config \ + --pid-file=$OCF_RESKEY_pid \ + --socket=$OCF_RESKEY_socket \ + --datadir=$OCF_RESKEY_datadir \ + --log-error=$OCF_RESKEY_log \ + --user=$OCF_RESKEY_user $OCF_RESKEY_additional_parameters \ + $mysql_extra_params >/dev/null 2>&1 & + pid=$! + + # Spin waiting for the server to be spawned. + # Let the CRM/LRM time us out if required. + start_wait=1 + while [ $start_wait = 1 ]; do + if ! ps $pid > /dev/null 2>&1; then + wait $pid + ocf_exit_reason "MySQL server failed to start (pid=$pid) (rc=$?), please check your installation" + return $OCF_ERR_GENERIC + fi + detect_galera_pid + running=$? + if [ $running -eq 0 ]; then + start_wait=0 + else + ocf_log info "MySQL is not running" + fi + sleep 2 + done + + return $OCF_SUCCESS +} + galera_start_local_node() { local rc local extra_opts local bootstrap bootstrap=$(is_bootstrap) master_exists if [ $? -eq 0 ]; then # join without bootstrapping ocf_log info "Node <${NODENAME}> is joining the cluster" extra_opts="--wsrep-cluster-address=${OCF_RESKEY_wsrep_cluster_address}" elif ocf_is_true $bootstrap; then ocf_log info "Node <${NODENAME}> is bootstrapping the cluster" extra_opts="--wsrep-cluster-address=gcomm://" else ocf_exit_reason "Failure, Attempted to join cluster of $OCF_RESOURCE_INSTANCE before master node has been detected." clear_last_commit return $OCF_ERR_GENERIC fi # clear last_commit before we start galera to make sure there # won't be discrepency between the cib and galera if this node # processes a few transactions and fails before we detect it clear_last_commit mysql_common_prepare_dirs - mysql_common_start "$extra_opts" - rc=$? - if [ $rc != $OCF_SUCCESS ]; then - return $rc - fi - mysql_common_status info - rc=$? 
+ # At start time, if galera requires a SST rather than an IST, the + # mysql server's pidfile won't be available until SST finishes, + # which can be longer than the start timeout. So we only check + # bootstrap node extensively. Joiner nodes are monitored in the + # "monitor" op + if ocf_is_true $bootstrap; then + # start server and wait until it's up and running + mysql_common_start "$extra_opts" + rc=$? + if [ $rc != $OCF_SUCCESS ]; then + return $rc + fi - if [ $rc != $OCF_SUCCESS ]; then - ocf_exit_reason "Failed initial monitor action" - return $rc - fi + mysql_common_status info + rc=$? - is_readonly - if [ $? -eq 0 ]; then - ocf_exit_reason "Failure. Master instance started in read-only mode, check configuration." - return $OCF_ERR_GENERIC - fi + if [ $rc != $OCF_SUCCESS ]; then + ocf_exit_reason "Failed initial monitor action" + return $rc + fi - is_primary - if [ $? -ne 0 ]; then - ocf_exit_reason "Failure. Master instance started, but is not in Primary mode." - return $OCF_ERR_GENERIC - fi + is_readonly + if [ $? -eq 0 ]; then + ocf_exit_reason "Failure. Master instance started in read-only mode, check configuration." + return $OCF_ERR_GENERIC + fi + + is_primary + if [ $? -ne 0 ]; then + ocf_exit_reason "Failure. Master instance started, but is not in Primary mode." + return $OCF_ERR_GENERIC + fi - if ocf_is_true $bootstrap; then clear_bootstrap_node + # clear attribute no-grastate. if last shutdown was + # not clean, we cannot be extra-cautious by requesting a SST + # since this is the bootstrap node + clear_no_grastate else + # only start server, defer full checks to "monitor" op + galera_start_nowait "$extra_opts" + rc=$? 
+ if [ $rc != $OCF_SUCCESS ]; then + return $rc + fi + set_sync_needed + # attribute no-grastate will be cleared once the joiner + # has finished syncing and is promoted to Master fi ocf_log info "Galera started" return $OCF_SUCCESS } +detect_last_commit() +{ + local last_commit + local recover_args="--defaults-file=$OCF_RESKEY_config \ + --pid-file=$OCF_RESKEY_pid \ + --socket=$OCF_RESKEY_socket \ + --datadir=$OCF_RESKEY_datadir \ + --user=$OCF_RESKEY_user" + local recovered_position_regex='s/.*WSREP\:\s*[R|r]ecovered\s*position.*\:\(.*\)\s*$/\1/p' + + ocf_log info "attempting to detect last commit version by reading ${OCF_RESKEY_datadir}/grastate.dat" + last_commit="$(cat ${OCF_RESKEY_datadir}/grastate.dat | sed -n 's/^seqno.\s*\(.*\)\s*$/\1/p')" + if [ -z "$last_commit" ] || [ "$last_commit" = "-1" ]; then + local tmp=$(mktemp) + local tmperr=$(mktemp) + + # if we pass here because grastate.dat doesn't exist, + # try not to bootstrap from this node if possible + if [ ! -f ${OCF_RESKEY_datadir}/grastate.dat ]; then + set_no_grastate + fi + + ocf_log info "now attempting to detect last commit version using 'mysqld_safe --wsrep-recover'" + + ${OCF_RESKEY_binary} $recover_args --wsrep-recover > $tmp 2> $tmperr + + last_commit="$(cat $tmp | sed -n $recovered_position_regex)" + if [ -z "$last_commit" ]; then + # Galera uses InnoDB's 2pc transactions internally. If + # server was stopped in the middle of a replication, the + # recovery may find a "prepared" XA transaction in the + # redo log, and mysql won't recover automatically + + cat $tmperr | grep -q -E '\[ERROR\]\s+Found\s+[0-9]+\s+prepared\s+transactions!' 2>/dev/null + if [ $? -eq 0 ]; then + # we can only rollback the transaction, but that's OK + # since the DB will get resynchronized anyway + ocf_log warn "local node <${NODENAME}> was not shutdown properly. 
Rollback stuck transaction with --tc-heuristic-recover" + ${OCF_RESKEY_binary} $recover_args --wsrep-recover \ + --tc-heuristic-recover=rollback > $tmp 2>/dev/null + + last_commit="$(cat $tmp | sed -n $recovered_position_regex)" + if [ ! -z "$last_commit" ]; then + ocf_log warn "State recovered. force SST at next restart for full resynchronization" + rm -f ${OCF_RESKEY_datadir}/grastate.dat + # try not to bootstrap from this node if possible + set_no_grastate + fi + fi + fi + rm -f $tmp $tmperr + fi + + if [ ! -z "$last_commit" ]; then + ocf_log info "Last commit version found: $last_commit" + set_last_commit $last_commit + return $OCF_SUCCESS + else + ocf_exit_reason "Unable to detect last known write sequence number" + clear_last_commit + return $OCF_ERR_GENERIC + fi +} galera_promote() { local rc local extra_opts local bootstrap master_exists if [ $? -ne 0 ]; then # promoting the first master will bootstrap the cluster if is_bootstrap; then galera_start_local_node rc=$? return $rc else ocf_exit_reason "Attempted to start the cluster without being a bootstrap node." return $OCF_ERR_GENERIC fi else # promoting other masters only performs sanity checks # as the joining nodes were started during the "monitor" op if ! check_sync_needed; then + # sync is done, clear info about last startup + clear_no_grastate return $OCF_SUCCESS else ocf_exit_reason "Attempted to promote local node while sync was still needed." return $OCF_ERR_GENERIC fi fi } galera_demote() { mysql_common_stop rc=$? if [ $rc -ne $OCF_SUCCESS ] && [ $rc -ne $OCF_NOT_RUNNING ]; then ocf_exit_reason "Failed to stop Master galera instance during demotion to Master" return $rc fi # if this node was previously a bootstrap node, that is no longer the case. clear_bootstrap_node clear_last_commit clear_sync_needed + clear_no_grastate - # record last commit by "starting" galera. 
start is just detection of the last sequence number - galera_start + # record last commit for next promotion + detect_last_commit + rc=$? + return $rc } galera_start() { - local last_commit + local rc echo $OCF_RESKEY_wsrep_cluster_address | grep -q $NODENAME if [ $? -ne 0 ]; then ocf_exit_reason "local node <${NODENAME}> must be a member of the wsrep_cluster_address <${OCF_RESKEY_wsrep_cluster_address}>to start this galera instance" return $OCF_ERR_CONFIGURED fi - mysql_common_status info + galera_status info if [ $? -ne $OCF_NOT_RUNNING ]; then ocf_exit_reason "master galera instance started outside of the cluster's control" return $OCF_ERR_GENERIC fi mysql_common_prepare_dirs - ocf_log info "attempting to detect last commit version by reading ${OCF_RESKEY_datadir}/grastate.dat" - last_commit="$(cat ${OCF_RESKEY_datadir}/grastate.dat | sed -n 's/^seqno.\s*\(.*\)\s*$/\1/p')" - if [ -z "$last_commit" ] || [ "$last_commit" = "-1" ]; then - ocf_log info "now attempting to detect last commit version using 'mysqld_safe --wsrep-recover'" - local tmp=$(mktemp) - ${OCF_RESKEY_binary} --defaults-file=$OCF_RESKEY_config \ - --pid-file=$OCF_RESKEY_pid \ - --socket=$OCF_RESKEY_socket \ - --datadir=$OCF_RESKEY_datadir \ - --user=$OCF_RESKEY_user \ - --wsrep-recover > $tmp 2>&1 - - last_commit="$(cat $tmp | sed -n 's/.*WSREP\:\s*[R|r]ecovered\s*position.*\:\(.*\)\s*$/\1/p')" - rm -f $tmp - - if [ "$last_commit" = "-1" ]; then - last_commit="0" - fi - fi - - if [ -z "$last_commit" ]; then - ocf_exit_reason "Unable to detect last known write sequence number" - clear_last_commit - return $OCF_ERR_GENERIC + detect_last_commit + rc=$? + if [ $rc -ne $OCF_SUCCESS ]; then + return $rc fi - ocf_log info "Last commit version found: $last_commit" - - set_last_commit $last_commit master_exists if [ $? 
-eq 0 ]; then ocf_log info "Master instances are already up, local node will join in when started" else clear_master_score detect_first_master fi return $OCF_SUCCESS } galera_monitor() { local rc local status_loglevel="err" # Set loglevel to info during probe if ocf_is_probe; then status_loglevel="info" fi - mysql_common_status $status_loglevel + # Check whether mysql is running or about to start after sync + galera_status $status_loglevel rc=$? if [ $rc -eq $OCF_NOT_RUNNING ]; then last_commit=$(get_last_commit $NODENAME) if [ -n "$last_commit" ];then rc=$OCF_SUCCESS if ocf_is_probe; then # prevent state change during probe return $rc fi master_exists if [ $? -ne 0 ]; then detect_first_master else # a master instance exists and is healthy. # start this node and mark it as "pending sync" ocf_log info "cluster is running. start local node to join in" galera_start_local_node rc=$? fi fi return $rc elif [ $rc -ne $OCF_SUCCESS ]; then return $rc fi - # if we make it here, mysql is running. Check cluster status now. + # if we make it here, mysql is running or about to start after sync. + # Check cluster status now. echo $OCF_RESKEY_wsrep_cluster_address | grep -q $NODENAME if [ $? -ne 0 ]; then ocf_exit_reason "local node <${NODENAME}> is started, but is not a member of the wsrep_cluster_address <${OCF_RESKEY_wsrep_cluster_address}>" return $OCF_ERR_GENERIC fi - is_primary + check_sync_needed if [ $? -eq 0 ]; then - check_sync_needed - if [ $? -eq 0 ]; then - # galera running and sync is needed: slave state - if ocf_is_probe; then - # prevent state change during probe - rc=$OCF_SUCCESS - else - check_sync_status - rc=$? - fi + # galera running and sync is needed: slave state + if ocf_is_probe; then + # prevent state change during probe + rc=$OCF_SUCCESS + else + check_sync_status + rc=$? + fi + else + is_primary + if [ $? -ne 0 ]; then + ocf_exit_reason "local node <${NODENAME}> is started, but not in primary mode. Unknown state." 
+ rc=$OCF_ERR_GENERIC else # galera running, no need to sync: master state and everything's clear rc=$OCF_RUNNING_MASTER if ocf_is_probe; then # restore master score during probe # if we detect this is a master instance set_master_score fi fi - else - ocf_exit_reason "local node <${NODENAME}> is started, but not in primary mode. Unknown state." - rc=$OCF_ERR_GENERIC fi return $rc } galera_stop() { local rc # make sure the process is stopped mysql_common_stop rc=$? clear_last_commit clear_master_score clear_bootstrap_node clear_sync_needed + clear_no_grastate return $rc } galera_validate() { if ! ocf_is_ms; then ocf_exit_reason "Galera must be configured as a multistate Master/Slave resource." return $OCF_ERR_CONFIGURED fi if [ -z "$OCF_RESKEY_wsrep_cluster_address" ]; then ocf_exit_reason "Galera must be configured with a wsrep_cluster_address value." return $OCF_ERR_CONFIGURED fi mysql_common_validate } case "$1" in meta-data) meta_data exit $OCF_SUCCESS;; usage|help) usage exit $OCF_SUCCESS;; esac galera_validate rc=$? 
LSB_STATUS_STOPPED=3 if [ $rc -ne 0 ]; then case "$1" in stop) exit $OCF_SUCCESS;; monitor) exit $OCF_NOT_RUNNING;; status) exit $LSB_STATUS_STOPPED;; *) exit $rc;; esac fi if [ -z "${OCF_RESKEY_check_passwd}" ]; then # This value is automatically sourced from /etc/sysconfig/checkcluster if available OCF_RESKEY_check_passwd=${MYSQL_PASSWORD} fi if [ -z "${OCF_RESKEY_check_user}" ]; then # This value is automatically sourced from /etc/sysconfig/checkcluster if available OCF_RESKEY_check_user=${MYSQL_USERNAME} fi : ${OCF_RESKEY_check_user="root"} MYSQL_OPTIONS_CHECK="-nNE --user=${OCF_RESKEY_check_user}" if [ -n "${OCF_RESKEY_check_passwd}" ]; then MYSQL_OPTIONS_CHECK="$MYSQL_OPTIONS_CHECK --password=${OCF_RESKEY_check_passwd}" fi # This value is automatically sourced from /etc/sysconfig/checkcluster if available if [ -n "${MYSQL_HOST}" ]; then MYSQL_OPTIONS_CHECK="$MYSQL_OPTIONS_CHECK -h ${MYSQL_HOST}" fi # This value is automatically sourced from /etc/sysconfig/checkcluster if available if [ -n "${MYSQL_PORT}" ]; then MYSQL_OPTIONS_CHECK="$MYSQL_OPTIONS_CHECK -P ${MYSQL_PORT}" fi # What kind of method was invoked? case "$1" in start) galera_start;; stop) galera_stop;; - status) mysql_common_status err;; + status) galera_status err;; monitor) galera_monitor;; promote) galera_promote;; demote) galera_demote;; validate-all) exit $OCF_SUCCESS;; *) usage exit $OCF_ERR_UNIMPLEMENTED;; esac # vi:sw=4:ts=4:et: diff --git a/heartbeat/garbd b/heartbeat/garbd new file mode 100755 index 000000000..950df76bb --- /dev/null +++ b/heartbeat/garbd @@ -0,0 +1,417 @@ +#!/bin/sh +# +# Copyright (c) 2015 Damien Ciabrini +# All Rights Reserved. +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of version 2 of the GNU General Public License as +# published by the Free Software Foundation. 
+# +# This program is distributed in the hope that it would be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. +# +# Further, this software is distributed without any warranty that it is +# free of the rightful claim of any third person regarding infringement +# or the like. Any license provided herein, whether implied or +# otherwise, applies only to this software file. Patent licenses, if +# any, provided herein do not apply to combinations of this program with +# other software, or any other product whatsoever. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write the Free Software Foundation, +# Inc., 59 Temple Place - Suite 330, Boston MA 02111-1307, USA. +# + +## +# README. +# +# Resource agent for garbd, the Galera arbitrator +# +# You can use this agent if you run an even number of galera nodes, +# and you want an additional node to avoid split-brain situations. +# +# garbd requires that a Galera cluster is running, so make sure to +# add a proper ordering constraint to the cluster, e.g.: +# +# pcs constraint order galera-master then garbd +# +# If you add garbd to the cluster while Galera is not running, you +# might want to disable it before setting up ordering constraint, e.g.: +# +# pcs resource create garbd garbd \ +# wsrep_cluster_address=gcomm://node1:4567,node2:4567 \ +# meta target-role=stopped +# +# Use location constraints to avoid running galera and garbd on +# the same node, e.g.: +# +# pcs constraint colocation add garbd with galera-master -INFINITY +# pcs constraint location garbd prefers node3=INFINITY +# +## + +####################################################################### +# Initialization: + +: ${OCF_FUNCTIONS_DIR=${OCF_ROOT}/lib/heartbeat} +. 
${OCF_FUNCTIONS_DIR}/ocf-shellfuncs + +####################################################################### +# Set default paramenter values + +OCF_RESKEY_binary_default="/usr/sbin/garbd" +OCF_RESKEY_log_default="/var/log/garbd.log" +OCF_RESKEY_pid_default="/var/run/garbd.pid" +OCF_RESKEY_user_default="mysql" +if [ "X${HOSTOS}" = "XOpenBSD" ];then + OCF_RESKEY_group_default="_mysql" +else + OCF_RESKEY_group_default="mysql" +fi + +: ${OCF_RESKEY_binary=${OCF_RESKEY_binary_default}} +: ${OCF_RESKEY_log=${OCF_RESKEY_log_default}} +: ${OCF_RESKEY_pid=${OCF_RESKEY_pid_default}} +: ${OCF_RESKEY_user=${OCF_RESKEY_user_default}} +: ${OCF_RESKEY_group=${OCF_RESKEY_group_default}} + +usage() { + cat < + + +1.0 + + +Resource script for managing Galera arbitrator. + +Manages a galera arbitrator instance + + + + +Location of the Galera arbitrator binary + +garbd server binary + + + + + +User running the garbd process + +garbd user + + + + + +Group running garbd (for logfile permissions) + +garbd group + + + + + +The logfile to be used for garbd. + +Galera arbitrator log file + + + + + +The pidfile to be used for garbd. + +Galera arbitrator pidfile + + + + + +Additional parameters which are passed to garbd on startup. + +Additional parameters to pass to garbd + + + + + +The galera cluster address. This takes the form of: +gcomm://node:port,node:port,node:port + +Unlike Galera servers, port is mandatory for garbd. + +Galera cluster address + + + + + +The group name of the Galera cluster to connect to. + +Galera cluster name + + + + + + + + + + + + + +END +} + + +garbd_start() +{ + local rc + local pid + local start_wait + local garbd_params + + garbd_status info + rc=$? 
+    if [ $rc -eq $OCF_SUCCESS ]; then
+        ocf_exit_reason "garbd started outside of the cluster's control"
+        return $OCF_ERR_GENERIC;
+    fi
+
+    # create the logfile up front, owned by the garbd user/group (mode 0640)
+    touch $OCF_RESKEY_log
+    chown $OCF_RESKEY_user:$OCF_RESKEY_group $OCF_RESKEY_log
+    chmod 0640 $OCF_RESKEY_log
+    [ -x /sbin/restorecon ] && /sbin/restorecon $OCF_RESKEY_log
+
+    garbd_params="--address=${OCF_RESKEY_wsrep_cluster_address} \
+        --group ${OCF_RESKEY_wsrep_cluster_name} \
+        --log ${OCF_RESKEY_log}"
+
+    if [ ! -z "${OCF_RESKEY_options}" ]; then
+        garbd_params="${garbd_params} --options=${OCF_RESKEY_options}"
+    fi
+
+    # garbd has no parameter to run as a specific user,
+    # so we need to start it by our own means
+    pid=$(su - -s /bin/sh $OCF_RESKEY_user -c "${OCF_RESKEY_binary} ${garbd_params} >/dev/null 2>&1 & echo \$!")
+
+    # garbd doesn't create a pidfile either, so we create our own.
+    # Save the status of the write before testing it: once the test
+    # has run, $? holds the result of the test, not of the write.
+    echo $pid > $OCF_RESKEY_pid
+    rc=$?
+    if [ $rc -ne 0 ]; then
+        ocf_exit_reason "Cannot create pidfile for garbd at $OCF_RESKEY_pid (rc=$rc), please check your installation"
+        return $OCF_ERR_GENERIC
+    fi
+
+    # Spin waiting for garbd to connect to the cluster.
+    # Let the CRM/LRM time us out if required.
+    start_wait=1
+    while [ $start_wait -eq 1 ]; do
+        garbd_monitor info
+        rc=$?
+        if [ $rc -eq $OCF_NOT_RUNNING ]; then
+            ocf_exit_reason "garbd failed to start (pid=$pid), check logs in ${OCF_RESKEY_log}"
+            return $OCF_ERR_GENERIC
+        elif [ $rc -eq $OCF_SUCCESS ]; then
+            start_wait=0
+        fi
+        sleep 2
+    done
+
+    ocf_log info "garbd connected to cluster \"${OCF_RESKEY_wsrep_cluster_name}\""
+    return $OCF_SUCCESS
+}
+
+# garbd_status <loglevel>
+# Translate the result of ocf_pidfile_status on $OCF_RESKEY_pid into an
+# OCF return code: 0 -> OCF_SUCCESS, 2 -> OCF_NOT_RUNNING, anything else
+# -> OCF_ERR_GENERIC (a result of 1 also removes the stale pidfile and
+# logs that at <loglevel>).
+garbd_status()
+{
+    local loglevel=$1
+    local rc
+    ocf_pidfile_status $OCF_RESKEY_pid
+    rc=$?
+
+    if [ $rc -eq 0 ]; then
+        return $OCF_SUCCESS
+    elif [ $rc -eq 2 ]; then
+        return $OCF_NOT_RUNNING
+    else
+        # clean up if pidfile is stale
+        if [ $rc -eq 1 ]; then
+            ocf_log $loglevel "garbd not running: removing old PID file"
+            rm -f $OCF_RESKEY_pid
+        fi
+        return $OCF_ERR_GENERIC
+    fi
+}
+
+# garbd_monitor <loglevel>
+# Report whether garbd is running and connected to the cluster.
+#   $1: log level for status messages (forced to "info" during a probe)
+# Returns OCF_SUCCESS when the process is alive and owns at least one
+# ESTABLISHED TCP connection, OCF_ERR_GENERIC when it is alive but has
+# none, OCF_NOT_RUNNING when a probe finds any non-success status, and
+# the garbd_status code otherwise.
+garbd_monitor()
+{
+    local rc
+    local pid
+    local loglevel=$1
+
+    # Set loglevel to info during probe
+    if ocf_is_probe; then
+        loglevel="info"
+    fi
+
+    garbd_status $loglevel
+    rc=$?
+
+    # probe just wants to know if garbd is running or not.
+    # ocf_is_probe must be *called*: inside [ ... ] it is only a
+    # non-empty string, which is always true and would turn every
+    # failed regular monitor into OCF_NOT_RUNNING as well.
+    if ocf_is_probe && [ $rc -ne $OCF_SUCCESS ]; then
+        rc=$OCF_NOT_RUNNING
+    fi
+
+    # Consider garbd is working if it's connected to at least
+    # one node in the galera cluster.
+    # Note: a Galera node in Non-Primary state will be
+    # stopped by the galera RA. So we can assume that
+    # garbd will always be connected to the right partition
+    if [ $rc -eq $OCF_SUCCESS ]; then
+        pid=`cat $OCF_RESKEY_pid 2> /dev/null `
+        # look for an ESTABLISHED TCP connection listed against our pid
+        netstat -tnp 2>/dev/null | grep -s -q "ESTABLISHED.*${pid}/"
+        if [ $? -ne 0 ]; then
+            ocf_log $loglevel "garbd disconnected from cluster \"${OCF_RESKEY_wsrep_cluster_name}\""
+            rc=$OCF_ERR_GENERIC
+        fi
+    fi
+
+    return $rc
+}
+
+# garbd_stop
+# Stop the process recorded in the pidfile with ocf_stop_processes.
+# Returns OCF_SUCCESS when there is no pidfile or the process was
+# stopped (the pidfile is then removed), OCF_ERR_GENERIC otherwise.
+garbd_stop()
+{
+    local rc
+    local pid
+
+    if [ ! -f $OCF_RESKEY_pid ]; then
+        ocf_log info "garbd is not running"
+        return $OCF_SUCCESS
+    fi
+
+    pid=`cat $OCF_RESKEY_pid 2> /dev/null `
+
+    ocf_log info "stopping garbd"
+
+    # make sure the process is stopped
+    ocf_stop_processes TERM 10 $pid
+    rc=$?
+
+    if [ $rc -ne 0 ]; then
+        return $OCF_ERR_GENERIC
+    else
+        rm -f $OCF_RESKEY_pid
+        ocf_log info "garbd stopped"
+        return $OCF_SUCCESS
+    fi
+}
+
+# garbd_validate
+# Check that the binaries, parameters and accounts needed by this agent
+# are present. Returns OCF_SUCCESS, OCF_ERR_INSTALLED or
+# OCF_ERR_CONFIGURED.
+garbd_validate()
+{
+    if ! have_binary "$OCF_RESKEY_binary"; then
+        ocf_exit_reason "Setup problem: couldn't find command: $OCF_RESKEY_binary"
+        return $OCF_ERR_INSTALLED;
+    fi
+
+    if !
have_binary "netstat"; then
+        # netstat is what garbd_monitor uses to check cluster connectivity
+        ocf_exit_reason "Setup problem: couldn't find command: netstat"
+        return $OCF_ERR_INSTALLED;
+    fi
+
+    if [ -z "$OCF_RESKEY_wsrep_cluster_address" ]; then
+        ocf_exit_reason "garbd must be configured with a wsrep_cluster_address value."
+        return $OCF_ERR_CONFIGURED
+    fi
+
+    # unlike galera RA, ports must be set in cluster address for garbd
+    # https://github.com/codership/galera/issues/98
+    # Strip the gcomm:// prefix and blanks, split on commas, and require
+    # every entry to end in ":<port>" with a port not starting with 0.
+    for node in $(echo "$OCF_RESKEY_wsrep_cluster_address" | sed 's/gcomm:\/\///g' | tr -d ' ' | tr -s ',' ' '); do
+        echo $node | grep -s -q ':[1-9][0-9]*$'
+        if [ $? -ne 0 ]; then
+            ocf_exit_reason "wsrep_cluster_address must specify ports (gcomm://node1:port,node2:port)."
+            return $OCF_ERR_CONFIGURED
+        fi
+    done
+
+    # Ensure that the encryption method is set if garbd is configured
+    # to use SSL.
+    # (SSL is considered enabled when the options mention socket.ssl_key=
+    # or socket.ssl_cert=, matched case-insensitively.)
+    echo $OCF_RESKEY_options | grep -s -q -i -E '\bsocket.ssl_(key|cert)='
+    if [ $? -eq 0 ]; then
+        echo $OCF_RESKEY_options | grep -s -q -i -E '\bsocket.ssl_cipher='
+        if [ $? -ne 0 ]; then
+            ocf_exit_reason "option socket.ssl_cipher must be set if SSL is enabled."
+            return $OCF_ERR_CONFIGURED
+        fi
+    fi
+
+    if [ -z "$OCF_RESKEY_wsrep_cluster_name" ]; then
+        ocf_exit_reason "garbd must be configured with a wsrep_cluster_name value."
+        return $OCF_ERR_CONFIGURED
+    fi
+
+    # the account and group used to run garbd and own its logfile must exist
+    if ! getent passwd $OCF_RESKEY_user >/dev/null 2>&1; then
+        ocf_exit_reason "User $OCF_RESKEY_user doesn't exist"
+        return $OCF_ERR_INSTALLED
+    fi
+
+    if ! getent group $OCF_RESKEY_group >/dev/null 2>&1; then
+        ocf_exit_reason "Group $OCF_RESKEY_group doesn't exist"
+        return $OCF_ERR_INSTALLED
+    fi
+
+    return $OCF_SUCCESS
+}
+
+# meta-data and usage are answered before validation, so they work
+# even when the agent is not (yet) configured.
+case "$1" in
+  meta-data) meta_data
+        exit $OCF_SUCCESS;;
+  usage|help) usage
+        exit $OCF_SUCCESS;;
+esac
+
+# validate once; the result is examined right after this point
+garbd_validate
+rc=$?
+
+# trap configuration errors early, but don't block stop in such cases
+LSB_STATUS_STOPPED=3
+if [ $rc -ne 0 ]; then
+    case "$1" in
+        stop) exit $OCF_SUCCESS;;
+        status) exit $LSB_STATUS_STOPPED;;
+        *) exit $rc;;
+    esac
+fi
+
+# What kind of method was invoked?
+# This agent defines no garbd_promote/garbd_demote functions, so
+# promote and demote are left to the default branch, which reports
+# OCF_ERR_UNIMPLEMENTED instead of failing with "command not found".
+case "$1" in
+  start)    garbd_start;;
+  stop)     garbd_stop;;
+  status)   garbd_status err;;
+  monitor)  garbd_monitor err;;
+  validate-all) exit $OCF_SUCCESS;;
+
+  *)    usage
+        exit $OCF_ERR_UNIMPLEMENTED;;
+esac
diff --git a/heartbeat/iSCSILogicalUnit b/heartbeat/iSCSILogicalUnit index 0b1670f30..c1bf11dbb 100755 --- a/heartbeat/iSCSILogicalUnit +++ b/heartbeat/iSCSILogicalUnit @@ -1,677 +1,690 @@ #!/bin/bash # # # iSCSILogicalUnit OCF RA. Exports and manages iSCSI Logical Units. # # (c) 2013 LINBIT, Lars Ellenberg # (c) 2009-2010 Florian Haas, Dejan Muhamedagic, # and Linux-HA contributors # # # This program is free software; you can redistribute it and/or modify # it under the terms of version 2 of the GNU General Public License as # published by the Free Software Foundation. # # This program is distributed in the hope that it would be useful, but # WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. # # Further, this software is distributed without any warranty that it is # free of the rightful claim of any third person regarding infringement # or the like. Any license provided herein, whether implied or # otherwise, applies only to this software file. Patent licenses, if # any, provided herein do not apply to combinations of this program with # other software, or any other product whatsoever. # # You should have received a copy of the GNU General Public License # along with this program; if not, write the Free Software Foundation, # Inc., 59 Temple Place - Suite 330, Boston MA 02111-1307, USA.
# ####################################################################### # Initialization: : ${OCF_FUNCTIONS_DIR=${OCF_ROOT}/lib/heartbeat} . ${OCF_FUNCTIONS_DIR}/ocf-shellfuncs # Defaults # Set a default implementation based on software installed if have_binary ietadm; then OCF_RESKEY_implementation_default="iet" elif have_binary tgtadm; then OCF_RESKEY_implementation_default="tgt" elif have_binary lio_node; then OCF_RESKEY_implementation_default="lio" elif have_binary targetcli; then OCF_RESKEY_implementation_default="lio-t" fi : ${OCF_RESKEY_implementation=${OCF_RESKEY_implementation_default}} # Use a default SCSI ID and SCSI SN that is unique across the cluster, # and persistent in the event of resource migration. # SCSI IDs are limited to 24 bytes, but only 16 bytes are known to be # supported by all iSCSI implementations this RA cares about. Thus, # for a default, use the first 16 characters of # $OCF_RESOURCE_INSTANCE. OCF_RESKEY_scsi_id_default="${OCF_RESOURCE_INSTANCE:0:16}" : ${OCF_RESKEY_scsi_id=${OCF_RESKEY_scsi_id_default}} # To have a reasonably unique default SCSI SN, use the first 8 bytes # of an MD5 hash of of $OCF_RESOURCE_INSTANCE sn=`echo -n "${OCF_RESOURCE_INSTANCE}" | openssl md5 | sed -e 's/(stdin)= //'` OCF_RESKEY_scsi_sn_default=${sn:0:8} : ${OCF_RESKEY_scsi_sn=${OCF_RESKEY_scsi_sn_default}} # set 0 as a default value for lio iblock device number OCF_RESKEY_lio_iblock_default=0 OCF_RESKEY_lio_iblock=${OCF_RESKEY_lio_iblock:-$OCF_RESKEY_lio_iblock_default} ## tgt specifics # tgt has "backing store type" and "backing store open flags", # as well as device-type. # # suggestions how to make this generic accross all supported implementations? # how should they be named, how should they be mapped to implementation specifics? 
# # OCF_RESKEY_tgt_bstype # OCF_RESKEY_tgt_bsoflags +# OCF_RESKEY_tgt_bsopts # OCF_RESKEY_tgt_device_type ####################################################################### meta_data() { cat < 0.9 Manages iSCSI Logical Unit. An iSCSI Logical unit is a subdivision of an SCSI Target, exported via a daemon that speaks the iSCSI protocol. Manages iSCSI Logical Units (LUs) The iSCSI target daemon implementation. Must be one of "iet", "tgt", "lio", or "lio-t". If unspecified, an implementation is selected based on the availability of management utilities, with "iet" being tried first, then "tgt", then "lio", then "lio-t". iSCSI target daemon implementation The iSCSI Qualified Name (IQN) that this Logical Unit belongs to. iSCSI target IQN The Logical Unit number (LUN) exposed to initiators. Logical Unit number (LUN) The path to the block device exposed. Some implementations allow this to be a regular file, too. Block device (or file) path The SCSI ID to be configured for this Logical Unit. The default is the resource name, truncated to 24 bytes. SCSI ID The SCSI serial number to be configured for this Logical Unit. The default is a hash of the resource name, truncated to 8 bytes. SCSI serial number The SCSI vendor ID to be configured for this Logical Unit. SCSI vendor ID The SCSI product ID to be configured for this Logical Unit. SCSI product ID TGT specific backing store type. If you want to use aio, make sure your tgtadm is built against libaio. See tgtadm(8). TGT backing store type TGT specific backing store open flags (direct|sync). See tgtadm(8). TGT backing store open flags + + +TGT specific backing store options. +See tgtadm(8). + +TGT backing store options + + + TGT specific device type. See tgtadm(8). TGT device type Additional LU parameters. A space-separated list of "name=value" pairs which will be passed through to the iSCSI daemon's management interface. The supported parameters are implementation dependent. 
Neither the name nor the value may contain whitespace. List of iSCSI LU parameters Allowed initiators. A space-separated list of initiators allowed to connect to this lun. Initiators may be listed in any syntax the target implementation allows. If this parameter is empty or not set, access to this lun will not be allowed from any initiator, if target is not in demo mode. This parameter is only necessary when using LIO. List of iSCSI initiators allowed to connect to this lun. LIO iblock device name, a number starting from 0. Using distinct values here avoids a warning in LIO "LEGACY: SHARED HBA"; and it is necessary when using multiple LUNs started at the same time (eg. on node failover) to prevent a race condition in tcm_core on mkdir() in /sys/kernel/config/target/core/. LIO iblock device number END } ####################################################################### iSCSILogicalUnit_usage() { cat < /sys/kernel/config/target/core/iblock_${OCF_RESKEY_lio_iblock}/${OCF_RESOURCE_INSTANCE}/wwn/vpd_unit_serial fi ocf_run targetcli /iscsi/${OCF_RESKEY_target_iqn}/tpg1/luns create /backstores/block/${OCF_RESOURCE_INSTANCE} ${OCF_RESKEY_lun} || exit $OCF_ERR_GENERIC if [ -n "${OCF_RESKEY_allowed_initiators}" ]; then for initiator in ${OCF_RESKEY_allowed_initiators}; do ocf_run targetcli /iscsi/${OCF_RESKEY_target_iqn}/tpg1/acls create ${initiator} add_mapped_luns=False || exit $OCF_ERR_GENERIC ocf_run targetcli /iscsi/${OCF_RESKEY_target_iqn}/tpg1/acls/${initiator} create ${OCF_RESKEY_lun} ${OCF_RESKEY_lun} || exit $OCF_ERR_GENERIC done fi ;; esac # Force the monitor operation to pass before start is considered a success. iSCSILogicalUnit_monitor } iSCSILogicalUnit_stop() { iSCSILogicalUnit_monitor if [ $? 
-eq $OCF_NOT_RUNNING ]; then return $OCF_SUCCESS fi case $OCF_RESKEY_implementation in iet) # IET allows us to remove LUs while they are in use ocf_run ietadm --op delete \ --tid=${TID} \ --lun=${OCF_RESKEY_lun} || exit $OCF_ERR_GENERIC ;; tgt) # tgt will fail to remove an LU while it is in use, # but at the same time does not allow us to # selectively shut down a connection that is using a # specific LU. Thus, we need to loop here until tgtd # decides that the LU is no longer in use, or we get # timed out by the LRM. while ! ocf_run -warn tgtadm --lld iscsi --op delete --mode logicalunit \ --tid ${TID} \ --lun=${OCF_RESKEY_lun}; do sleep 1 done ;; lio) acls_configfs_path="/sys/kernel/config/target/iscsi/${OCF_RESKEY_target_iqn}/tpgt_1/acls" for initiatorpath in ${acls_configfs_path}/*; do initiator=$(basename "${initiatorpath}") if [ -e "${initiatorpath}/lun_${OCF_RESKEY_lun}" ]; then ocf_log info "deleting acl at ${initiatorpath}/lun_${OCF_RESKEY_lun}" ocf_run lio_node --dellunacl=${OCF_RESKEY_target_iqn} 1 \ ${initiator} ${OCF_RESKEY_lun} || exit $OCF_ERR_GENERIC fi done lun_configfs_path="/sys/kernel/config/target/iscsi/${OCF_RESKEY_target_iqn}/tpgt_1/lun/lun_${OCF_RESKEY_lun}/" if [ -e "${lun_configfs_path}" ]; then ocf_run lio_node --dellun=${OCF_RESKEY_target_iqn} 1 ${OCF_RESKEY_lun} || exit $OCF_ERR_GENERIC fi block_configfs_path="/sys/kernel/config/target/core/iblock_${OCF_RESKEY_lio_iblock}/${OCF_RESOURCE_INSTANCE}/udev_path" if [ -e "${block_configfs_path}" ]; then ocf_run tcm_node --freedev=iblock_${OCF_RESKEY_lio_iblock}/${OCF_RESOURCE_INSTANCE} || exit $OCF_ERR_GENERIC fi ;; lio-t) # "targetcli delete" will fail if the LUN is already # gone. Log a warning and still push ahead. 
ocf_run -warn targetcli /iscsi/${OCF_RESKEY_target_iqn}/tpg1/luns delete ${OCF_RESKEY_lun} if [ -n "${OCF_RESKEY_allowed_initiators}" ]; then for initiator in ${OCF_RESKEY_allowed_initiators}; do if targetcli /iscsi/${OCF_RESKEY_target_iqn}/tpg1/acls/${initiator} status | grep "Mapped LUNs: 0" >/dev/null ; then ocf_run -warn targetcli /iscsi/${OCF_RESKEY_target_iqn}/tpg1/acls/ delete ${initiator} fi done fi # If we've proceeded down to here and we're unable to # delete the backstore, then something is seriously # wrong and we need to fail the stop operation # (potentially causing fencing) ocf_run targetcli /backstores/block delete ${OCF_RESOURCE_INSTANCE} || exit $OCF_ERR_GENERIC ;; esac return $OCF_SUCCESS } iSCSILogicalUnit_monitor() { - # If our backing device (or file) doesn't even exist, we're not running - [ -e ${OCF_RESKEY_path} ] || return $OCF_NOT_RUNNING + if [ x"${OCF_RESKEY_tgt_bstype}" != x"rbd" ]; then + # If our backing device (or file) doesn't even exist, we're not running + [ -e ${OCF_RESKEY_path} ] || return $OCF_NOT_RUNNING + fi case $OCF_RESKEY_implementation in iet) # Figure out and set the target ID TID=`sed -ne "s/tid:\([[:digit:]]\+\) name:${OCF_RESKEY_target_iqn}$/\1/p" < /proc/net/iet/volume` if [ -z "${TID}" ]; then # Our target is not configured, thus we're not # running. return $OCF_NOT_RUNNING fi # FIXME: this looks for a matching LUN and path, but does # not actually test for the correct target ID. grep -E -q "[[:space:]]+lun:${OCF_RESKEY_lun}.*path:${OCF_RESKEY_path}$" /proc/net/iet/volume && return $OCF_SUCCESS ;; tgt) # Figure out and set the target ID TID=`tgtadm --lld iscsi --op show --mode target \ | sed -ne "s/^Target \([[:digit:]]\+\): ${OCF_RESKEY_target_iqn}$/\1/p"` if [ -z "$TID" ]; then # Our target is not configured, thus we're not # running. return $OCF_NOT_RUNNING fi # This only looks for the backing store, but does not test # for the correct target ID and LUN. 
tgtadm --lld iscsi --op show --mode target \ | grep -E -q "[[:space:]]+Backing store.*: ${OCF_RESKEY_path}$" && return $OCF_SUCCESS ;; lio) configfs_path="/sys/kernel/config/target/iscsi/${OCF_RESKEY_target_iqn}/tpgt_1/lun/lun_${OCF_RESKEY_lun}/${OCF_RESOURCE_INSTANCE}/udev_path" [ -e ${configfs_path} ] && [ `cat ${configfs_path}` = "${OCF_RESKEY_path}" ] && return $OCF_SUCCESS # if we aren't activated, is a block device still left over? block_configfs_path="/sys/kernel/config/target/core/iblock_${OCF_RESKEY_lio_iblock}/${OCF_RESOURCE_INSTANCE}/udev_path" [ -e ${block_configfs_path} ] && ocf_log warn "existing block without an active lun: ${block_configfs_path}" [ -e ${block_configfs_path} ] && return $OCF_ERR_GENERIC ;; lio-t) configfs_path="/sys/kernel/config/target/iscsi/${OCF_RESKEY_target_iqn}/tpgt_1/lun/lun_${OCF_RESKEY_lun}/*/udev_path" [ -e ${configfs_path} ] && [ `cat ${configfs_path}` = "${OCF_RESKEY_path}" ] && return $OCF_SUCCESS # if we aren't activated, is a block device still left over? block_configfs_path="/sys/kernel/config/target/core/iblock_*/${OCF_RESOURCE_INSTANCE}/udev_path" [ -e ${block_configfs_path} ] && ocf_log warn "existing block without an active lun: ${block_configfs_path}" [ -e ${block_configfs_path} ] && return $OCF_ERR_GENERIC ;; esac return $OCF_NOT_RUNNING } iSCSILogicalUnit_validate() { # Do we have all required variables? for var in target_iqn lun path; do param="OCF_RESKEY_${var}" if [ -z "${!param}" ]; then ocf_exit_reason "Missing resource parameter \"$var\"!" exit $OCF_ERR_CONFIGURED fi done # Is the configured implementation supported? 
case "$OCF_RESKEY_implementation" in "iet"|"tgt"|"lio"|"lio-t") ;; "") # The user didn't specify an implementation, and we were # unable to determine one from installed binaries (in # other words: no binaries for any supported # implementation could be found) ocf_exit_reason "Undefined iSCSI target implementation" exit $OCF_ERR_INSTALLED ;; *) ocf_exit_reason "Unsupported iSCSI target implementation \"$OCF_RESKEY_implementation\"!" exit $OCF_ERR_CONFIGURED ;; esac # Do we have a valid LUN? case $OCF_RESKEY_implementation in iet) # IET allows LUN 0 and up [ $OCF_RESKEY_lun -ge 0 ] case $? in 0) # OK ;; 1) ocf_log err "Invalid LUN $OCF_RESKEY_lun (must be a non-negative integer)." exit $OCF_ERR_CONFIGURED ;; *) ocf_log err "Invalid LUN $OCF_RESKEY_lun (must be an integer)." exit $OCF_ERR_CONFIGURED ;; esac ;; tgt) # tgt reserves LUN 0 for its own purposes [ $OCF_RESKEY_lun -ge 1 ] case $? in 0) # OK ;; 1) ocf_log err "Invalid LUN $OCF_RESKEY_lun (must be greater than 0)." exit $OCF_ERR_CONFIGURED ;; *) ocf_log err "Invalid LUN $OCF_RESKEY_lun (must be an integer)." exit $OCF_ERR_CONFIGURED ;; esac ;; esac # Do we have any configuration parameters that the current # implementation does not support? 
local unsupported_params local var local envar case $OCF_RESKEY_implementation in iet) # IET does not support setting the vendor and product ID # (it always uses "IET" and "VIRTUAL-DISK") - unsupported_params="vendor_id product_id allowed_initiators lio_iblock tgt_bstype tgt_bsoflags tgt_device_type" + unsupported_params="vendor_id product_id allowed_initiators lio_iblock tgt_bstype tgt_bsoflags tgt_bsopts tgt_device_type" ;; tgt) unsupported_params="allowed_initiators lio_iblock" ;; lio) - unsupported_params="scsi_id vendor_id product_id tgt_bstype tgt_bsoflags tgt_device_type" + unsupported_params="scsi_id vendor_id product_id tgt_bstype tgt_bsoflags tgt_bsopts tgt_device_type" ;; lio-t) - unsupported_params="scsi_id vendor_id product_id tgt_bstype tgt_bsoflags tgt_device_type lio_iblock" + unsupported_params="scsi_id vendor_id product_id tgt_bstype tgt_bsoflags tgt_bsopts tgt_device_type lio_iblock" ;; esac for var in ${unsupported_params}; do envar=OCF_RESKEY_${var} defvar=OCF_RESKEY_${var}_default if [ -n "${!envar}" ]; then if [[ "${!envar}" != "${!defvar}" ]];then case "$__OCF_ACTION" in start|validate-all) ocf_log warn "Configuration parameter \"${var}\"" \ "is not supported by the iSCSI implementation" \ "and will be ignored." ;; esac fi fi done if ! ocf_is_probe; then # Do we have all required binaries? case $OCF_RESKEY_implementation in iet) check_binary ietadm ;; tgt) check_binary tgtadm ;; lio) check_binary tcm_node check_binary lio_node ;; lio-t) check_binary targetcli ;; esac # Is the required kernel functionality available? case $OCF_RESKEY_implementation in iet) [ -d /proc/net/iet ] if [ $? -ne 0 ]; then ocf_log err "/proc/net/iet does not exist or is not a directory -- check if required modules are loaded." 
exit $OCF_ERR_INSTALLED fi ;; tgt) # tgt is userland only ;; esac fi return $OCF_SUCCESS } case $1 in meta-data) meta_data exit $OCF_SUCCESS ;; usage|help) iSCSILogicalUnit_usage exit $OCF_SUCCESS ;; esac # Everything except usage and meta-data must pass the validate test iSCSILogicalUnit_validate case $__OCF_ACTION in start) iSCSILogicalUnit_start;; stop) iSCSILogicalUnit_stop;; monitor|status) iSCSILogicalUnit_monitor;; reload) ocf_log err "Reloading..." iSCSILogicalUnit_start ;; validate-all) ;; *) iSCSILogicalUnit_usage exit $OCF_ERR_UNIMPLEMENTED ;; esac rc=$? ocf_log debug "${OCF_RESOURCE_INSTANCE} $__OCF_ACTION : $rc" exit $rc diff --git a/heartbeat/iSCSITarget b/heartbeat/iSCSITarget index b71a21f29..08832cd64 100755 --- a/heartbeat/iSCSITarget +++ b/heartbeat/iSCSITarget @@ -1,663 +1,683 @@ #!/bin/bash # # # iSCSITarget OCF RA. Exports and manages iSCSI targets. # # (c) 2009-2010 Florian Haas, Dejan Muhamedagic, # and Linux-HA contributors # # This program is free software; you can redistribute it and/or modify # it under the terms of version 2 of the GNU General Public License as # published by the Free Software Foundation. # # This program is distributed in the hope that it would be useful, but # WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. # # Further, this software is distributed without any warranty that it is # free of the rightful claim of any third person regarding infringement # or the like. Any license provided herein, whether implied or # otherwise, applies only to this software file. Patent licenses, if # any, provided herein do not apply to combinations of this program with # other software, or any other product whatsoever. # # You should have received a copy of the GNU General Public License # along with this program; if not, write the Free Software Foundation, # Inc., 59 Temple Place - Suite 330, Boston MA 02111-1307, USA. 
# ####################################################################### # Initialization: : ${OCF_FUNCTIONS_DIR=${OCF_ROOT}/lib/heartbeat} . ${OCF_FUNCTIONS_DIR}/ocf-shellfuncs # Defaults # Set a default implementation based on software installed if have_binary ietadm; then OCF_RESKEY_implementation_default="iet" elif have_binary tgtadm; then OCF_RESKEY_implementation_default="tgt" elif have_binary lio_node; then OCF_RESKEY_implementation_default="lio" elif have_binary targetcli; then OCF_RESKEY_implementation_default="lio-t" fi : ${OCF_RESKEY_implementation=${OCF_RESKEY_implementation_default}} # Listen on 0.0.0.0:3260 by default OCF_RESKEY_portals_default="0.0.0.0:3260" : ${OCF_RESKEY_portals=${OCF_RESKEY_portals_default}} # Lockfile, used for selecting a target ID LOCKFILE=${HA_RSCTMP}/iSCSITarget-${OCF_RESKEY_implementation}.lock ####################################################################### meta_data() { cat < 0.9 Manages iSCSI targets. An iSCSI target is a collection of SCSI Logical Units (LUs) exported via a daemon that speaks the iSCSI protocol. iSCSI target export agent The iSCSI target daemon implementation. Must be one of "iet", "tgt", "lio", or "lio-t". If unspecified, an implementation is selected based on the availability of management utilities, with "iet" being tried first, then "tgt", then "lio", then "lio-t". Specifies the iSCSI target implementation ("iet", "tgt", "lio", or "lio-t"). The target iSCSI Qualified Name (IQN). Should follow the conventional "iqn.yyyy-mm.<reversed domain name>[:identifier]" syntax. iSCSI target IQN The iSCSI target ID. Required for tgt. iSCSI target ID iSCSI network portal addresses. Not supported by all implementations. If unset, the default is to create one portal that listens on ${OCF_RESKEY_portal_default}. iSCSI portal addresses + + +iSCSI iSER network portal addresses. Not supported by all +implementations. + +iSCSI iSER enabled portal addresses + + + Allowed initiators. 
A space-separated list of initiators allowed to connect to this target. Initiators may be listed in any syntax the target implementation allows. If this parameter is empty or not set, access to this target will be allowed from any initiator. List of iSCSI initiators allowed to connect to this target A username used for incoming initiator authentication. If unspecified, allowed initiators will be able to log in without authentication. This is a unique parameter, as it not allowed to re-use a single username across multiple target instances. Incoming account username A password used for incoming initiator authentication. Incoming account password Additional target parameters. A space-separated list of "name=value" pairs which will be passed through to the iSCSI daemon's management interface. The supported parameters are implementation dependent. Neither the name nor the value may contain whitespace. List of iSCSI target parameters END } ####################################################################### iSCSITarget_usage() { cat <> /etc/initiators.deny echo "${OCF_RESKEY_iqn} ${OCF_RESKEY_allowed_initiators// /,}" >> /etc/initiators.allow else echo "${OCF_RESKEY_iqn} ALL" >> /etc/initiators.allow fi # In iet, adding a new user and assigning it to a target # is one operation. if [ -n "${OCF_RESKEY_incoming_username}" ]; then ocf_run ietadm --op new --user \ --tid=${tid} \ --params=IncomingUser=${OCF_RESKEY_incoming_username},Password=${OCF_RESKEY_incoming_password} \ || exit $OCF_ERR_GENERIC fi ;; tgt) local tid tid="${OCF_RESKEY_tid}" # Create the target. ocf_run tgtadm --lld iscsi --op new --mode target \ --tid=${tid} \ --targetname ${OCF_RESKEY_iqn} || exit $OCF_ERR_GENERIC # Set parameters. 
for param in ${OCF_RESKEY_additional_parameters}; do name=${param%=*} value=${param#*=} ocf_run tgtadm --lld iscsi --op update --mode target \ --tid=${tid} \ --name=${name} --value=${value} || exit $OCF_ERR_GENERIC done # For tgt, we always have to add access per initiator; # access to targets is denied by default. If # "allowed_initiators" is unset, we must use the special # keyword ALL. for initiator in ${OCF_RESKEY_allowed_initiators=ALL}; do ocf_run tgtadm --lld iscsi --op bind --mode target \ --tid=${tid} \ --initiator-address=${initiator} || exit $OCF_ERR_GENERIC done # In tgt, we must first create a user account, then assign # it to a target using the "bind" operation. if [ -n "${OCF_RESKEY_incoming_username}" ]; then ocf_run tgtadm --lld iscsi --mode account --op new \ --user=${OCF_RESKEY_incoming_username} \ --password=${OCF_RESKEY_incoming_password} || exit $OCF_ERR_GENERIC ocf_run tgtadm --lld iscsi --mode account --op bind \ --tid=${tid} \ --user=${OCF_RESKEY_incoming_username} || exit $OCF_ERR_GENERIC fi ;; lio) # lio distinguishes between targets and target portal # groups (TPGs). We will always create one TPG, with the # number 1. In lio, creating a network portal # automatically creates the corresponding target if it # doesn't already exist. for portal in ${OCF_RESKEY_portals}; do ocf_run lio_node --addnp ${OCF_RESKEY_iqn} 1 \ ${portal} || exit $OCF_ERR_GENERIC done # in lio, we can set target parameters by manipulating # the appropriate configfs entries for param in ${OCF_RESKEY_additional_parameters}; do name=${param%=*} value=${param#*=} configfs_path="/sys/kernel/config/target/iscsi/${OCF_RESKEY_iqn}/tpgt_1/param/${name}" if [ -e ${configfs_path} ]; then echo ${value} > ${configfs_path} || exit $OCF_ERR_GENERIC else ocf_log warn "Unsupported iSCSI target parameter ${name}: will be ignored." fi done # lio does per-initiator filtering by default. To disable # this, we need to switch the target to "permissive mode". 
if [ -n "${OCF_RESKEY_allowed_initiators}" ]; then for initiator in ${OCF_RESKEY_allowed_initiators}; do ocf_run lio_node --addnodeacl ${OCF_RESKEY_iqn} 1 \ ${initiator} || exit $OCF_ERR_GENERIC done else ocf_run lio_node --permissive ${OCF_RESKEY_iqn} 1 || exit $OCF_ERR_GENERIC # permissive mode enables read-only access by default, # so we need to change that to RW to be in line with # the other implementations. echo 0 > "/sys/kernel/config/target/iscsi/${OCF_RESKEY_iqn}/tpgt_1/attrib/demo_mode_write_protect" if [ `cat /sys/kernel/config/target/iscsi/${OCF_RESKEY_iqn}/tpgt_1/attrib/demo_mode_write_protect` -ne 0 ]; then ocf_log err "Failed to disable write protection for target ${OCF_RESKEY_iqn}." exit $OCF_ERR_GENERIC fi fi # TODO: add CHAP authentication support when it gets added # back into LIO ocf_run lio_node --disableauth ${OCF_RESKEY_iqn} 1 || exit $OCF_ERR_GENERIC # Finally, we need to enable the target to allow # initiators to connect ocf_run lio_node --enabletpg=${OCF_RESKEY_iqn} 1 || exit $OCF_ERR_GENERIC ;; lio-t) # lio distinguishes between targets and target portal # groups (TPGs). We will always create one TPG, with the # number 1. In lio, creating a network portal # automatically creates the corresponding target if it # doesn't already exist. 
ocf_run targetcli /iscsi set global auto_add_default_portal=false || exit $OCF_ERR_GENERIC ocf_run targetcli /iscsi create ${OCF_RESKEY_iqn} || exit $OCF_ERR_GENERIC for portal in ${OCF_RESKEY_portals}; do if [ $portal != ${OCF_RESKEY_portals_default} ] ; then IFS=':' read -a sep_portal <<< "$portal" ocf_run targetcli /iscsi/${OCF_RESKEY_iqn}/tpg1/portals create "${sep_portal[0]}" "${sep_portal[1]}" || exit $OCF_ERR_GENERIC else ocf_run targetcli /iscsi create ${OCF_RESKEY_iqn} || exit $OCF_ERR_GENERIC fi done # in lio, we can set target parameters by manipulating # the appropriate configfs entries for param in ${OCF_RESKEY_additional_parameters}; do name=${param%=*} value=${param#*=} configfs_path="/sys/kernel/config/target/iscsi/${OCF_RESKEY_iqn}/tpgt_1/param/${name}" if [ -e ${configfs_path} ]; then echo ${value} > ${configfs_path} || exit $OCF_ERR_GENERIC else ocf_log warn "Unsupported iSCSI target parameter ${name}: will be ignored." fi done + + # allow iSER enabled portal + for iser_portal in ${OCF_RESKEY_iser_portals}; do + configfs_path="/sys/kernel/config/target/iscsi/${OCF_RESKEY_iqn}/tpgt_1/np/${iser_portal}\:*/iser" + if [ -f ${configfs_path} ]; then + echo "1" > ${configfs_path} || exit $OCF_ERR_GENERIC + else + ocf_log warn "Unable to set iSER on: $iser_portal" + fi + done + # lio does per-initiator filtering by default. To disable # this, we need to switch the target to "permissive mode". 
if [ -n "${OCF_RESKEY_allowed_initiators}" ]; then for initiator in ${OCF_RESKEY_allowed_initiators}; do ocf_run targetcli /iscsi/${OCF_RESKEY_iqn}/tpg1/acls create ${initiator} || exit $OCF_ERR_GENERIC done else ocf_run targetcli /iscsi/${OCF_RESKEY_iqn}/tpg1/ set attribute authentication=0 demo_mode_write_protect=0 generate_node_acls=1 cache_dynamic_acls=1 || exit $OCF_ERR_GENERIC fi # TODO: add CHAP authentication support when it gets added # back into LIO ocf_run targetcli /iscsi/${OCF_RESKEY_iqn}/tpg1/ set attribute authentication=0 || exit $OCF_ERR_GENERIC # ocf_run targetcli /iscsi ;; esac iSCSITarget_monitor } iSCSITarget_stop() { iSCSITarget_monitor if [ $? -eq $OCF_NOT_RUNNING ]; then return $OCF_SUCCESS fi local tid case $OCF_RESKEY_implementation in iet) # Figure out the target ID tid=`sed -ne "s/tid:\([[:digit:]]\+\) name:${OCF_RESKEY_iqn}/\1/p" < /proc/net/iet/volume` if [ -z "${tid}" ]; then ocf_log err "Failed to retrieve target ID for IQN ${OCF_RESKEY_iqn}" exit $OCF_ERR_GENERIC fi # Close existing connections. There is no other way to # do this in IET than to parse the contents of # /proc/net/iet/session. set -- $(sed -ne '/^tid:'${tid}' /,/^tid/ { /^[[:space:]]*sid:\([0-9]\+\)/ { s/^[[:space:]]*sid:\([0-9]*\).*/--sid=\1/; h; }; /^[[:space:]]*cid:\([0-9]\+\)/ { s/^[[:space:]]*cid:\([0-9]*\).*/--cid=\1/; G; p; }; }' < /proc/net/iet/session) while [[ -n $2 ]]; do # $2 $1 looks like "--sid=X --cid=Y" ocf_run ietadm --op delete \ --tid=${tid} $2 $1 shift 2 done # In iet, unassigning a user from a target and # deleting the user account is one operation. if [ -n "${OCF_RESKEY_incoming_username}" ]; then ocf_run ietadm --op delete --user \ --tid=${tid} \ --params=IncomingUser=${OCF_RESKEY_incoming_username} \ || exit $OCF_ERR_GENERIC fi # Loop on delete. Keep trying until we time out, if # necessary. while true; do if ietadm --op delete --tid=${tid}; then ocf_log debug "Removed target ${OCF_RESKEY_iqn}." 
break else ocf_log warn "Failed to remove target ${OCF_RESKEY_iqn}, retrying." sleep 1 fi done # Avoid stale /etc/initiators.{allow,deny} entries # for this target if [ -e /etc/initiators.deny ]; then ocf_run sed -e "/^${OCF_RESKEY_iqn}[[:space:]]/d" \ -i /etc/initiators.deny fi if [ -e /etc/initiators.allow ]; then ocf_run sed -e "/^${OCF_RESKEY_iqn}[[:space:]]/d" \ -i /etc/initiators.allow fi ;; tgt) tid="${OCF_RESKEY_tid}" # Close existing connections. There is no other way to # do this in tgt than to parse the output of "tgtadm --op # show". set -- $(tgtadm --lld iscsi --op show --mode target \ | sed -ne '/^Target '${tid}':/,/^Target/ { /^[[:space:]]*I_T nexus: \([0-9]\+\)/ { s/^.*: \([0-9]*\).*/--sid=\1/; h; }; /^[[:space:]]*Connection: \([0-9]\+\)/ { s/^.*: \([0-9]*\).*/--cid=\1/; G; p; }; /^[[:space:]]*LUN information:/ q; }') while [[ -n $2 ]]; do # $2 $1 looks like "--sid=X --cid=Y" ocf_run tgtadm --lld iscsi --op delete --mode connection \ --tid=${tid} $2 $1 shift 2 done # In tgt, we must first unbind the user account from # the target, then remove the account itself. if [ -n "${OCF_RESKEY_incoming_username}" ]; then ocf_run tgtadm --lld iscsi --mode account --op unbind \ --tid=${tid} \ --user=${OCF_RESKEY_incoming_username} || exit $OCF_ERR_GENERIC ocf_run tgtadm --lld iscsi --mode account --op delete \ --user=${OCF_RESKEY_incoming_username} || exit $OCF_ERR_GENERIC fi # Loop on delete. Keep trying until we time out, if # necessary. while true; do if tgtadm --lld iscsi --op delete --mode target --tid=${tid}; then ocf_log debug "Removed target ${OCF_RESKEY_iqn}." break else ocf_log warn "Failed to remove target ${OCF_RESKEY_iqn}, retrying." sleep 1 fi done # In tgt, we don't have to worry about our ACL # entries. They are automatically removed upon target # deletion. ;; lio) # In lio, removing a target automatically removes all # associated TPGs, network portals, and LUNs. 
ocf_run lio_node --deliqn ${OCF_RESKEY_iqn} || exit $OCF_ERR_GENERIC ;; lio-t) ocf_run targetcli /iscsi delete ${OCF_RESKEY_iqn} || exit $OCF_ERR_GENERIC ;; esac return $OCF_SUCCESS } iSCSITarget_monitor() { case $OCF_RESKEY_implementation in iet) grep -Eq "tid:[0-9]+ name:${OCF_RESKEY_iqn}" /proc/net/iet/volume && return $OCF_SUCCESS ;; tgt) tgtadm --lld iscsi --op show --mode target \ | grep -Eq "Target [0-9]+: ${OCF_RESKEY_iqn}" && return $OCF_SUCCESS ;; lio | lio-t) # if we have no configfs entry for the target, it's # definitely stopped [ -d /sys/kernel/config/target/iscsi/${OCF_RESKEY_iqn} ] || return $OCF_NOT_RUNNING # if the target is there, but its TPG is not enabled, then # we also consider it stopped [ `cat /sys/kernel/config/target/iscsi/${OCF_RESKEY_iqn}/tpgt_1/enable` -eq 1 ] || return $OCF_NOT_RUNNING return $OCF_SUCCESS ;; esac return $OCF_NOT_RUNNING } iSCSITarget_validate() { # Do we have all required variables? local required_vars case $OCF_RESKEY_implementation in iet) required_vars="iqn" ;; tgt) required_vars="iqn tid" ;; esac for var in ${required_vars}; do param="OCF_RESKEY_${var}" if [ -z "${!param}" ]; then ocf_exit_reason "Missing resource parameter \"$var\"!" exit $OCF_ERR_CONFIGURED fi done # Is the configured implementation supported? case "$OCF_RESKEY_implementation" in "iet"|"tgt"|"lio"|"lio-t") ;; "") # The user didn't specify an implementation, and we were # unable to determine one from installed binaries (in # other words: no binaries for any supported # implementation could be found) ocf_exit_reason "Undefined iSCSI target implementation" exit $OCF_ERR_INSTALLED ;; *) ocf_exit_reason "Unsupported iSCSI target implementation \"$OCF_RESKEY_implementation\"!" exit $OCF_ERR_CONFIGURED ;; esac # Do we have any configuration parameters that the current # implementation does not support? 
local unsupported_params local var local envar case $OCF_RESKEY_implementation in iet|tgt) # IET and tgt do not support binding a target portal to a # specific IP address. unsupported_params="portals" ;; lio|lio-t) # TODO: Remove incoming_username and incoming_password # from this check when LIO 3.0 gets CHAP authentication unsupported_params="tid incoming_username incoming_password" ;; esac for var in ${unsupported_params}; do envar=OCF_RESKEY_${var} defvar=OCF_RESKEY_${var}_default if [ -n "${!envar}" ]; then if [[ "${!envar}" != "${!defvar}" ]];then case "$__OCF_ACTION" in start|validate-all) ocf_log warn "Configuration parameter \"${var}\"" \ "is not supported by the iSCSI implementation" \ "and will be ignored." ;; esac fi fi done if ! ocf_is_probe; then # Do we have all required binaries? case $OCF_RESKEY_implementation in iet) check_binary ietadm ;; tgt) check_binary tgtadm ;; lio) check_binary tcm_node check_binary lio_node ;; lio-t) check_binary targetcli ;; esac # Is the required kernel functionality available? case $OCF_RESKEY_implementation in iet) [ -d /proc/net/iet ] if [ $? -ne 0 ]; then ocf_log err "/proc/net/iet does not exist or is not a directory -- check if required modules are loaded." exit $OCF_ERR_INSTALLED fi ;; tgt) # tgt is userland only ;; lio) # lio needs configfs to be mounted if ! grep -Eq "^.*/sys/kernel/config[[:space:]]+configfs" /proc/mounts; then ocf_log err "configfs not mounted at /sys/kernel/config -- check if required modules are loaded." exit $OCF_ERR_INSTALLED fi # check for configfs entries created by target_core_mod if [ ! -d /sys/kernel/config/target ]; then ocf_log err "/sys/kernel/config/target does not exist or is not a directory -- check if required modules are loaded." 
exit $OCF_ERR_INSTALLED fi ;; lio-t) #targetcli loads the needed kernel modules ;; esac fi return $OCF_SUCCESS } case $1 in meta-data) meta_data exit $OCF_SUCCESS ;; usage|help) iSCSITarget_usage exit $OCF_SUCCESS ;; esac # Everything except usage and meta-data must pass the validate test iSCSITarget_validate case $__OCF_ACTION in start) iSCSITarget_start;; stop) iSCSITarget_stop;; monitor|status) iSCSITarget_monitor;; reload) ocf_log err "Reloading..." iSCSITarget_start ;; validate-all) ;; *) iSCSITarget_usage exit $OCF_ERR_UNIMPLEMENTED ;; esac rc=$? ocf_log debug "${OCF_RESOURCE_INSTANCE} $__OCF_ACTION : $rc" exit $rc diff --git a/heartbeat/iscsi b/heartbeat/iscsi index ef0236e47..81cd78eba 100755 --- a/heartbeat/iscsi +++ b/heartbeat/iscsi @@ -1,514 +1,516 @@ #!/bin/sh # # iSCSI OCF resource agent # Description: manage iSCSI disks (add/remove) using open-iscsi # # Copyright Dejan Muhamedagic # (C) 2007 Novell Inc. All Rights Reserved. # # This program is free software; you can redistribute it and/or modify # it under the terms of version 2 of the GNU General Public License as # published by the Free Software Foundation. # # This program is distributed in the hope that it would be useful, but # WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. # # Further, this software is distributed without any warranty that it is # free of the rightful claim of any third person regarding infringement # or the like. Any license provided herein, whether implied or # otherwise, applies only to this software file. Patent licenses, if # any, provided herein do not apply to combinations of this program with # other software, or any other product whatsoever. # # You should have received a copy of the GNU General Public License # along with this program; if not, write the Free Software Foundation, # Inc., 59 Temple Place - Suite 330, Boston MA 02111-1307, USA. # # See usage() and meta_data() below for more details... 
# # OCF instance parameters: # OCF_RESKEY_portal: the iSCSI portal address or host name (required) # OCF_RESKEY_target: the iSCSI target (required) # OCF_RESKEY_iscsiadm: iscsiadm program path (optional) # OCF_RESKEY_discovery_type: discovery type (optional; default: sendtargets) # OCF_RESKEY_try_recovery: wait for iSCSI recovery in monitor (optional; default: false) # # Initialization: : ${OCF_FUNCTIONS_DIR=${OCF_ROOT}/lib/heartbeat} . ${OCF_FUNCTIONS_DIR}/ocf-shellfuncs # Defaults OCF_RESKEY_udev_default="yes" OCF_RESKEY_iscsiadm_default="iscsiadm" OCF_RESKEY_discovery_type_default="sendtargets" OCF_RESKEY_try_recovery_default="false" : ${OCF_RESKEY_udev=${OCF_RESKEY_udev_default}} : ${OCF_RESKEY_iscsiadm=${OCF_RESKEY_iscsiadm_default}} : ${OCF_RESKEY_discovery_type=${OCF_RESKEY_discovery_type_default}} usage() { methods=`iscsi_methods` methods=`echo $methods | tr ' ' '|'` cat < 1.0 OCF Resource Agent for iSCSI. Add (start) or remove (stop) iSCSI targets. Manages a local iSCSI initiator and its connections to iSCSI targets The iSCSI portal address in the form: {ip_address|hostname}[":"port] Portal address The iSCSI target IQN. Target IQN Target discovery type. Check the open-iscsi documentation for supported discovery types. Target discovery type open-iscsi administration utility binary. iscsiadm binary If the next resource depends on the udev creating a device then we wait until it is finished. On a normally loaded host this should be done quickly, but you may be unlucky. If you are not using udev set this to "no", otherwise we will spin in a loop until a timeout occurs. udev If the iSCSI session exists but is currently inactive/broken, which is most probably due to network problems, the iSCSI layer will try to recover. If this parameter is set to true, we'll wait for the recovery to succeed. In that case the monitor operation can only time out so you should set the monitor op timeout attribute appropriately. 
On error wait for iSCSI recovery in monitor EOF } iscsi_methods() { cat <= "2.0-872" changed discovery semantics # see http://www.mail-archive.com/open-iscsi@googlegroups.com/msg04883.html # there's a new discoverydb command which should be used instead discovery open_iscsi_discovery() { local output local discovery_variant="discovery" local options="" local cmd local version=`$iscsiadm --version | awk '{print $3}'` ocf_version_cmp "$version" "2.0-871" if [ $? -eq 2 ]; then # newer than 2.0-871? discovery_variant="discoverydb" [ "$discovery_type" = "sendtargets" ] && options="-D" fi cmd="$iscsiadm -m $discovery_variant -p $OCF_RESKEY_portal -t $discovery_type $options" output=`$cmd` if [ $? -ne 0 -o x = "x$output" ]; then [ x != "x$output" ] && { ocf_exit_reason "$cmd FAILED" echo "$output" } return 3 fi PORTAL=`echo "$output" | awk -v target="$OCF_RESKEY_target" ' $NF==target{ if( NF==3 ) portal=$2; # sles compat mode else portal=$1; sub(",.*","",portal); print portal; }'` case `echo "$PORTAL" | wc -w` in 0) #target not found echo "$output" ocf_exit_reason "target $OCF_RESKEY_target not found at portal $OCF_RESKEY_portal" return 1 ;; 1) #we're ok return 0 ;; *) # handle multihome hosts reporting multiple portals for p in $PORTAL; do if [ "$OCF_RESKEY_portal" = "$p" ]; then PORTAL="$OCF_RESKEY_portal" return 0 fi done echo "$output" ocf_exit_reason "sorry, can't handle multihomed hosts unless you specify the portal exactly" return 2 ;; esac } open_iscsi_add() { $iscsiadm -m node -p $1 -T $2 -l } open_iscsi_get_session_id() { local target="$1" + local portal="$2" $iscsiadm -m session 2>/dev/null | grep -E "$target($|[[:space:]])" | + grep -E "] $portal" | awk '{print $2}' | tr -d '[]' } open_iscsi_remove() { local target="$1" local session_id - session_id=`open_iscsi_get_session_id "$target"` + session_id=`open_iscsi_get_session_id "$target" "$OCF_RESKEY_portal"` if [ "$session_id" ]; then $iscsiadm -m session -r $session_id -u else ocf_exit_reason "cannot find 
session id for target $target" return 1 fi } # open_iscsi_monitor return codes: # 0: target running (logged in) # 1: target not running and target record exists # 2: iscsiadm -m session error (unexpected) # 3: target record does not exist (discovery necessary) # open_iscsi_monitor() { local target="$1" local session_id conn_state outp local prev_state local recov recov=${2:-$OCF_RESKEY_try_recovery} - session_id=`open_iscsi_get_session_id "$target"` + session_id=`open_iscsi_get_session_id "$target" "$OCF_RESKEY_portal"` prev_state="" if [ -z "$session_id" ]; then if $iscsiadm -m node -p $OCF_RESKEY_portal -T $target >/dev/null 2>&1; then return 1 # record found else return 3 fi fi while :; do outp=`$iscsiadm -m session -r $session_id -P 1` || return 2 conn_state=`echo "$outp" | sed -n '/Connection State/s/.*: //p'` # some drivers don't return connection state, in that case # we'll assume that we're still connected case "$conn_state" in "LOGGED IN") [ -n "$msg_logged" ] && ocf_log info "connection state $conn_state. Session restored." return 0;; "Unknown"|"") # this is also probably OK [ -n "$msg_logged" ] && ocf_log info "connection state $conn_state. Session restored." return 0;; *) # failed if [ "$__OCF_ACTION" != stop ] && ! ocf_is_probe && ocf_is_true $recov; then if [ "$conn_state" != "$prev_state" ]; then ocf_log warning "connection state $conn_state, waiting for recovery..." prev_state="$conn_state" fi sleep 1 else ocf_exit_reason "iscsiadm output: $outp" return 2 fi ;; esac done } disk_discovery() { discovery_type=${OCF_RESKEY_discovery_type} $discovery # discover and setup the real portal string (address) case $? in 0) ;; 1|2) exit $OCF_ERR_GENERIC ;; 3) if ! is_iscsid_running; then [ $setup_rc -eq 1 ] && ocf_log warning "iscsid.startup probably not correctly set in /etc/iscsi/iscsid.conf" exit $OCF_ERR_INSTALLED fi exit $OCF_ERR_GENERIC ;; esac } # # NB: this is udev specific! 
# wait_for_udev() { dev=/dev/disk/by-path/ip-$PORTAL-iscsi-$OCF_RESKEY_target while :; do ls $dev* >/dev/null 2>&1 && break ocf_log warning "waiting for udev to create $dev" sleep 1 done } iscsi_monitor() { $disk_status $OCF_RESKEY_target $* case $? in 0) return $OCF_SUCCESS;; 1|3) return $OCF_NOT_RUNNING;; 2) return $OCF_ERR_GENERIC;; esac } iscsi_start() { local rc $disk_status $OCF_RESKEY_target rc=$? if [ $rc -eq 3 ]; then disk_discovery $disk_status $OCF_RESKEY_target rc=$? fi case $rc in 0) ocf_log info "iscsi $PORTAL $OCF_RESKEY_target already running" return $OCF_SUCCESS ;; 1) $add_disk $PORTAL $OCF_RESKEY_target || return $OCF_ERR_GENERIC case "$OCF_RESKEY_udev" in [Yy]es) wait_for_udev || return $OCF_ERR_GENERIC ;; *) ;; esac ;; *) # the session exists, but it's broken ocf_log warning "iscsi $PORTAL $OCF_RESKEY_target in failed state" ;; esac iscsi_monitor 1 # enforce wait if [ $? -eq $OCF_SUCCESS ]; then return $OCF_SUCCESS else return $OCF_ERR_GENERIC fi } iscsi_stop() { iscsi_monitor if [ $? -ne $OCF_NOT_RUNNING ] ; then $remove_disk $OCF_RESKEY_target || return $OCF_ERR_GENERIC iscsi_monitor if [ $? -ne $OCF_NOT_RUNNING ] ; then return $OCF_ERR_GENERIC else return $OCF_SUCCESS fi else ocf_log info "iscsi $OCF_RESKEY_target already stopped" return $OCF_SUCCESS fi } # # 'main' starts here... 
# if [ $# -ne 1 ]; then usage exit $OCF_ERR_ARGS fi # These operations don't require OCF instance parameters to be set case "$1" in meta-data) meta_data exit $OCF_SUCCESS;; usage) usage exit $OCF_SUCCESS;; methods) iscsi_methods exit $OCF_SUCCESS;; esac if [ x = "x$OCF_RESKEY_target" ]; then ocf_exit_reason "target parameter not set" exit $OCF_ERR_CONFIGURED fi if [ x = "x$OCF_RESKEY_portal" ]; then ocf_exit_reason "portal parameter not set" exit $OCF_ERR_CONFIGURED fi case `uname` in Linux) setup=open_iscsi_setup ;; *) ocf_log info "platform `uname` may not be supported" setup=open_iscsi_setup ;; esac PORTAL="$OCF_RESKEY_portal" # updated by discovery LSB_STATUS_STOPPED=3 $setup setup_rc=$? if [ $setup_rc -gt 1 ]; then ocf_exit_reason "iscsi initiator utilities not installed or not setup" case "$1" in stop) exit $OCF_SUCCESS;; monitor) exit $OCF_NOT_RUNNING;; status) exit $LSB_STATUS_STOPPED;; *) exit $OCF_ERR_INSTALLED;; esac fi if [ `id -u` != 0 ]; then ocf_exit_reason "$0 must be run as root" exit $OCF_ERR_PERM fi # which method was invoked? case "$1" in start) iscsi_start ;; stop) iscsi_stop ;; status) iscsi_monitor rc=$? case $rc in $OCF_SUCCESS) echo iscsi target $OCF_RESKEY_target running ;; $OCF_NOT_RUNNING) echo iscsi target $OCF_RESKEY_target stopped ;; *) echo iscsi target $OCF_RESKEY_target failed ;; esac exit $rc ;; monitor) iscsi_monitor ;; validate-all) # everything already validated # just exit successfully here. 
exit $OCF_SUCCESS;; *) iscsi_methods exit $OCF_ERR_UNIMPLEMENTED;; esac # # vim:tabstop=4:shiftwidth=4:textwidth=0:wrapmargin=0 diff --git a/heartbeat/mysql b/heartbeat/mysql index e2d54dd17..be914d3b2 100755 --- a/heartbeat/mysql +++ b/heartbeat/mysql @@ -1,1044 +1,1045 @@ #!/bin/sh # # # MySQL # # Description: Manages a MySQL database as Linux-HA resource # # Authors: Alan Robertson: DB2 Script # Jakub Janczak: rewrite as MySQL # Andrew Beekhof: cleanup and import # Sebastian Reitenbach: add OpenBSD defaults, more cleanup # Narayan Newton: add Gentoo/Debian defaults # Marian Marinov, Florian Haas: add replication capability # Yves Trudeau, Baron Schwartz: add VIP support and improve replication # # Support: linux-ha@lists.linux-ha.org # License: GNU General Public License (GPL) # # (c) 2002-2005 International Business Machines, Inc. # 2005-2010 Linux-HA contributors # # An example usage in /etc/ha.d/haresources: # node1 10.0.0.170 mysql # # See usage() function below for more details... # # OCF instance parameters: # OCF_RESKEY_binary # OCF_RESKEY_client_binary # OCF_RESKEY_config # OCF_RESKEY_datadir # OCF_RESKEY_user # OCF_RESKEY_group # OCF_RESKEY_test_table # OCF_RESKEY_test_user # OCF_RESKEY_test_passwd # OCF_RESKEY_enable_creation # OCF_RESKEY_additional_parameters # OCF_RESKEY_log # OCF_RESKEY_pid # OCF_RESKEY_socket # OCF_RESKEY_replication_user # OCF_RESKEY_replication_passwd # OCF_RESKEY_replication_port # OCF_RESKEY_max_slave_lag # OCF_RESKEY_evict_outdated_slaves # OCF_RESKEY_reader_attribute ####################################################################### # Initialization: : ${OCF_FUNCTIONS_DIR=${OCF_ROOT}/lib/heartbeat} . ${OCF_FUNCTIONS_DIR}/ocf-shellfuncs . ${OCF_FUNCTIONS_DIR}/mysql-common.sh ####################################################################### usage() { cat < 1.0 Resource script for MySQL. 
May manage a standalone MySQL database, a clone set with externally managed replication, or a complete master/slave replication setup. Note, when master/slave replication is in use, the resource must be setup to use notifications. Set 'notify=true' in the metadata attributes when defining a MySQL master/slave instance. While managing replication, the default behavior is to use uname -n values in the change master to command. Other IPs can be specified manually by adding a node attribute \${INSTANCE_ATTR_NAME}_mysql_master_IP giving the IP to use for replication. For example, if the mysql primitive you are using is p_mysql, the attribute to set will be p_mysql_mysql_master_IP. Manages a MySQL database instance Location of the MySQL server binary MySQL server binary Location of the MySQL client binary MySQL client binary Configuration file MySQL config Directory containing databases MySQL datadir User running MySQL daemon MySQL user Group running MySQL daemon (for logfile and directory permissions) MySQL group The logfile to be used for mysqld. MySQL log file The pidfile to be used for mysqld. MySQL pid file The socket to be used for mysqld. MySQL socket Table to be tested in monitor statement (in database.table notation) MySQL test table MySQL test user, must have select privilege on test_table MySQL test user MySQL test user password MySQL test user password If the MySQL database does not exist, it will be created Create the database if it does not exist Additional parameters which are passed to the mysqld on startup. (e.g. --skip-external-locking or --skip-grant-tables) Additional parameters to pass to mysqld MySQL replication user. This user is used for starting and stopping MySQL replication, for setting and resetting the master host, and for setting and unsetting read-only mode. Because of that, this user must have SUPER, REPLICATION SLAVE, REPLICATION CLIENT, PROCESS and RELOAD privileges on all nodes within the cluster. 
Mandatory if you define a master-slave resource. MySQL replication user MySQL replication password. Used for replication client and slave. Mandatory if you define a master-slave resource. MySQL replication user password The port on which the Master MySQL instance is listening. MySQL replication port The maximum number of seconds a replication slave is allowed to lag behind its master. Do not set this to zero. What the cluster manager does in case a slave exceeds this maximum lag is determined by the evict_outdated_slaves parameter. Maximum time (seconds) a MySQL slave is allowed to lag behind a master If set to true, any slave which is more than max_slave_lag seconds behind the master has its MySQL instance shut down. If this parameter is set to false in a primitive or clone resource, it is simply ignored. If set to false in a master/slave resource, then exceeding the maximum slave lag will merely push down the master preference so the lagging slave is never promoted to the new master. Determines whether to shut down badly lagging slaves An attribute that the RA can manage to specify whether a node can be read from. This node attribute will be 1 if it's fine to read from the node, and 0 otherwise (for example, when a slave has lagged too far behind the master). A typical example for the use of this attribute would be to tie a set of IP addresses to MySQL slaves that can be read from. This parameter is only meaningful in master/slave set configurations. Sets the node attribute that determines whether a node is usable for clients to read from. END } # Convenience functions set_read_only() { # Sets or unsets read-only mode. Accepts one boolean as its # optional argument. If invoked without any arguments, defaults to # enabling read only mode. Should only be set in master/slave # setups. # Returns $OCF_SUCCESS if the operation succeeds, or # $OCF_ERR_GENERIC if it fails. 
local ro_val if ocf_is_true $1; then ro_val="on" else ro_val="off" fi ocf_run $MYSQL $MYSQL_OPTIONS_REPL \ -e "SET GLOBAL read_only=${ro_val}" } get_read_only() { # Check if read-only is set local read_only_state read_only_state=`$MYSQL $MYSQL_OPTIONS_REPL \ -e "SHOW VARIABLES" | grep -w read_only | awk '{print $2}'` if [ "$read_only_state" = "ON" ]; then return 0 else return 1 fi } is_slave() { # Determine whether the machine is currently running as a MySQL # slave, as determined per SHOW SLAVE STATUS. Returns 1 if SHOW # SLAVE STATUS creates an empty result set, 0 otherwise. local rc local tmpfile # Check whether this machine should be slave if ! ocf_is_ms || ! get_read_only; then return 1 fi get_slave_info rc=$? rm -f $tmpfile if [ $rc -eq 0 ]; then # show slave status is not empty # Is there a master_log_file defined? (master_log_file is deleted # by reset slave if [ "$master_log_file" ]; then return 0 else return 1 fi else # "SHOW SLAVE STATUS" returns an empty set if instance is not a # replication slave return 1 fi } parse_slave_info() { # Extracts field $1 from result of "SHOW SLAVE STATUS\G" from file $2 sed -ne "s/^.* $1: \(.*\)$/\1/p" < $2 } get_slave_info() { # Warning: this sets $tmpfile and LEAVE this file! You must delete it after use! 
local mysql_options if [ "$master_log_file" -a "$master_host" ]; then # variables are already defined, get_slave_info has been run before return $OCF_SUCCESS else tmpfile=`mktemp ${HA_RSCTMP}/check_slave.${OCF_RESOURCE_INSTANCE}.XXXXXX` $MYSQL $MYSQL_OPTIONS_REPL \ -e 'SHOW SLAVE STATUS\G' > $tmpfile if [ -s $tmpfile ]; then master_host=`parse_slave_info Master_Host $tmpfile` master_user=`parse_slave_info Master_User $tmpfile` master_port=`parse_slave_info Master_Port $tmpfile` master_log_file=`parse_slave_info Master_Log_File $tmpfile` master_log_pos=`parse_slave_info Read_Master_Log_Pos $tmpfile` slave_sql=`parse_slave_info Slave_SQL_Running $tmpfile` slave_io=`parse_slave_info Slave_IO_Running $tmpfile` last_errno=`parse_slave_info Last_Errno $tmpfile` secs_behind=`parse_slave_info Seconds_Behind_Master $tmpfile` ocf_log debug "MySQL instance running as a replication slave" else # Instance produced an empty "SHOW SLAVE STATUS" output -- # instance is not a slave ocf_exit_reason "check_slave invoked on an instance that is not a replication slave." return $OCF_ERR_GENERIC fi return $OCF_SUCCESS fi } check_slave() { # Checks slave status local rc new_master get_slave_info rc=$? if [ $rc -eq 0 ]; then # Did we receive an error other than max_connections? if [ $last_errno -ne 0 -a $last_errno -ne "$MYSQL_TOO_MANY_CONN_ERR" ]; then # Whoa. Replication ran into an error. This slave has # diverged from its master. Make sure this resource # doesn't restart in place. ocf_exit_reason "MySQL instance configured for replication, but replication has failed." ocf_log err "See $tmpfile for details" # Just pull the reader VIP away, killing MySQL here would be pretty evil # on a loaded server set_reader_attr 0 exit $OCF_SUCCESS fi # If we got max_connections, let's remove the vip if [ $last_errno -eq "$MYSQL_TOO_MANY_CONN_ERR" ]; then set_reader_attr 0 exit $OCF_SUCCESS fi if [ "$slave_io" != 'Yes' ]; then # Not necessarily a bad thing. 
The master may have # temporarily shut down, and the slave may just be # reconnecting. A warning can't hurt, though. ocf_log warn "MySQL Slave IO threads currently not running." # Sanity check, are we at least on the right master new_master=`$CRM_ATTR_REPL_INFO --query -q | cut -d'|' -f1` if [ "$master_host" != "$new_master" ]; then # Not pointing to the right master, not good, removing the VIPs set_reader_attr 0 exit $OCF_SUCCESS fi fi if [ "$slave_sql" != 'Yes' ]; then # We don't have a replication SQL thread running. Not a # good thing. Try to recoved by restarting the SQL thread # and remove reader vip. Prevent MySQL restart. ocf_exit_reason "MySQL Slave SQL threads currently not running." ocf_log err "See $tmpfile for details" # Remove reader vip set_reader_attr 0 # try to restart slave ocf_run $MYSQL $MYSQL_OPTIONS_REPL \ -e "START SLAVE" # Return success to prevent a restart exit $OCF_SUCCESS fi if ocf_is_true $OCF_RESKEY_evict_outdated_slaves; then # We're supposed to bail out if we lag too far # behind. Let's check our lag. if [ "$secs_behind" = "NULL" ] || [ $secs_behind -gt $OCF_RESKEY_max_slave_lag ]; then ocf_exit_reason "MySQL Slave is $secs_behind seconds behind master (allowed maximum: $OCF_RESKEY_max_slave_lag)." ocf_log err "See $tmpfile for details" # Remove reader vip set_reader_attr 0 exit $OCF_ERR_INSTALLED fi elif ocf_is_ms; then # Even if we're not set to evict lagging slaves, we can # still use the seconds behind master value to set our # master preference. 
local master_pref master_pref=$((${OCF_RESKEY_max_slave_lag}-${secs_behind})) if [ $master_pref -lt 0 ]; then # Sanitize a below-zero preference to just zero master_pref=0 fi $CRM_MASTER -v $master_pref fi # is the slave ok to have a VIP on it if [ "$secs_behind" = "NULL" ] || [ $secs_behind -gt $OCF_RESKEY_max_slave_lag ]; then set_reader_attr 0 else set_reader_attr 1 fi ocf_log debug "MySQL instance running as a replication slave" rm -f $tmpfile else # Instance produced an empty "SHOW SLAVE STATUS" output -- # instance is not a slave # TODO: Needs to handle when get_slave_info will return too many connections error rm -f $tmpfile ocf_exit_reason "check_slave invoked on an instance that is not a replication slave." exit $OCF_ERR_GENERIC fi } set_master() { local new_master master_log_file master_log_pos local master_params new_master=`$CRM_ATTR_REPL_INFO --query -q | cut -d'|' -f1` # Keep replication position get_slave_info if [ "$master_log_file" -a "$new_master" = "$master_host" ]; then # master_params=", MASTER_LOG_FILE='$master_log_file', \ # MASTER_LOG_POS=$master_log_pos" ocf_log info "Kept master pos for $master_host : $master_log_file:$master_log_pos" rm -f $tmpfile return else master_log_file=`$CRM_ATTR_REPL_INFO --query -q | cut -d'|' -f2` master_log_pos=`$CRM_ATTR_REPL_INFO --query -q | cut -d'|' -f3` if [ -n "$master_log_file" -a -n "$master_log_pos" ]; then master_params=", MASTER_LOG_FILE='$master_log_file', \ MASTER_LOG_POS=$master_log_pos" ocf_log info "Restored master pos for $new_master : $master_log_file:$master_log_pos" fi fi # Informs the MySQL server of the master to replicate # from. Accepts one mandatory argument which must contain the host # name of the new master host. The master must either be unchanged # from the laste master the slave replicated from, or freshly # reset with RESET MASTER. 
ocf_run $MYSQL $MYSQL_OPTIONS_REPL \ -e "CHANGE MASTER TO MASTER_HOST='$new_master', \ + MASTER_PORT=$OCF_RESKEY_replication_port, \ MASTER_USER='$OCF_RESKEY_replication_user', \ MASTER_PASSWORD='$OCF_RESKEY_replication_passwd' $master_params" rm -f $tmpfile } unset_master(){ # Instructs the MySQL server to stop replicating from a master # host. # If we're currently not configured to be replicating from any # host, then there's nothing to do. But we do log a warning as # no-one but the CRM should be touching the MySQL master/slave # configuration. if ! is_slave; then ocf_log warn "Attempted to unset the replication master on an instance that is not configured as a replication slave" return $OCF_SUCCESS fi local tmpfile tmpfile=`mktemp ${HA_RSCTMP}/unset_master.${OCF_RESOURCE_INSTANCE}.XXXXXX` # At this point, the master is read only so there should not be much binlogs to transfer # Let's wait for the last bits while true; do $MYSQL $MYSQL_OPTIONS_REPL \ -e 'SHOW PROCESSLIST\G' > $tmpfile if grep -i 'Waiting for master to send event' $tmpfile >/dev/null; then ocf_log info "MySQL slave has finished reading master binary log" break fi if grep -i 'Reconnecting after a failed master event read' $tmpfile >/dev/null; then ocf_log info "Master is down, no more binary logs to come" break fi if grep -i 'Connecting to master' $tmpfile >/dev/null; then ocf_log info "Master is down, no more binary logs to come" break fi if ! grep 'system user' $tmpfile >/dev/null; then ocf_log info "Slave is not running - not waiting to finish" break fi sleep 1 done # Now, stop the slave I/O thread and wait for relay log # processing to complete ocf_run $MYSQL $MYSQL_OPTIONS_REPL \ -e "STOP SLAVE IO_THREAD" if [ $? 
-gt 0 ]; then ocf_exit_reason "Error stopping slave IO thread" exit $OCF_ERR_GENERIC fi while true; do $MYSQL $MYSQL_OPTIONS_REPL \ -e 'SHOW PROCESSLIST\G' > $tmpfile if grep -i 'Has read all relay log' $tmpfile >/dev/null; then ocf_log info "MySQL slave has finished processing relay log" break fi if ! grep -q 'system user' $tmpfile; then ocf_log info "Slave not runnig - not waiting to finish" break fi ocf_log info "Waiting for MySQL slave to finish processing relay log" sleep 1 done rm -f $tmpfile # Now, stop all slave activity and unset the master host ocf_run $MYSQL $MYSQL_OPTIONS_REPL \ -e "STOP SLAVE" if [ $? -gt 0 ]; then ocf_exit_reason "Error stopping rest slave threads" exit $OCF_ERR_GENERIC fi ocf_run $MYSQL $MYSQL_OPTIONS_REPL \ -e "RESET SLAVE /*!50516 ALL */;" if [ $? -gt 0 ]; then ocf_exit_reason "Failed to reset slave" exit $OCF_ERR_GENERIC fi } # Start replication as slave start_slave() { ocf_run $MYSQL $MYSQL_OPTIONS_REPL \ -e "START SLAVE" } # Set the attribute controlling the readers VIP set_reader_attr() { local curr_attr_value curr_attr_value=$(get_reader_attr) if [ "$curr_attr_value" -ne "$1" ]; then $CRM_ATTR -l reboot --name ${OCF_RESKEY_reader_attribute} -v $1 fi } # get the attribute controlling the readers VIP get_reader_attr() { local attr_value local rc attr_value=`$CRM_ATTR -l reboot --name ${OCF_RESKEY_reader_attribute} --query -q` rc=$? if [ "$rc" -eq "0" ]; then echo $attr_value else echo -1 fi } # Stores data for MASTER STATUS from MySQL update_data_master_status() { master_status_file="${HA_RSCTMP}/master_status.${OCF_RESOURCE_INSTANCE}" $MYSQL $MYSQL_OPTIONS_REPL -e "SHOW MASTER STATUS\G" > $master_status_file } # Returns the specified value from the stored copy of SHOW MASTER STATUS. # should be call after update_data_master_status for tmpfile # Arguments: # $1 The value to get. 
get_master_status() { awk -v var="$1" '$1 == var ":" {print substr($0, index($0, ":") + 2)}' "$master_status_file" } # Determines what IP address is attached to the current host. The output of the # crm_attribute command looks like this: # scope=nodes name=IP value=10.2.2.161 # If the ${INSTANCE_ATTR_NAME}_MYSQL_MASTER_IP node attribute is not defined, fallback is to uname -n # The ${INSTANCE_ATTR_NAME}_MYSQL_MASTER_IP is the IP address that will be used for the # change master to command. get_local_ip() { local IP IP=`$CRM_ATTR -l forever -n ${INSTANCE_ATTR_NAME}_mysql_master_IP -q -G` if [ ! $? -eq 0 ]; then uname -n else echo $IP fi } ####################################################################### # Functions invoked by resource manager actions mysql_monitor() { local rc local status_loglevel="err" # Set loglevel to info during probe if ocf_is_probe; then status_loglevel="info" fi mysql_common_status $status_loglevel rc=$? # TODO: check max connections error # If status returned an error, return that immediately if [ $rc -ne $OCF_SUCCESS ]; then return $rc fi if [ $OCF_CHECK_LEVEL -gt 0 -a -n "$OCF_RESKEY_test_table" ]; then # Check if this instance is configured as a slave, and if so # check slave status if is_slave; then check_slave fi # Check for test table ocf_run -q $MYSQL $MYSQL_OPTIONS_TEST \ -e "SELECT COUNT(*) FROM $OCF_RESKEY_test_table" rc=$? if [ $rc -ne 0 ]; then ocf_exit_reason "Failed to select from $test_table"; return $OCF_ERR_GENERIC; fi fi if ocf_is_ms && ! get_read_only; then ocf_log debug "MySQL monitor succeeded (master)"; return $OCF_RUNNING_MASTER else ocf_log debug "MySQL monitor succeeded"; return $OCF_SUCCESS fi } mysql_start() { local rc if ocf_is_ms; then # Initialize the ReaderVIP attribute, monitor will enable it set_reader_attr 0 fi mysql_common_status info if [ $? 
= $OCF_SUCCESS ]; then ocf_log info "MySQL already running" return $OCF_SUCCESS fi mysql_common_prepare_dirs # Uncomment to perform permission clensing # - not convinced this should be enabled by default # #chmod 0755 $OCF_RESKEY_datadir #chown -R $OCF_RESKEY_user $OCF_RESKEY_datadir #chgrp -R $OCF_RESKEY_group $OCF_RESKEY_datadir mysql_extra_params= if ocf_is_ms; then mysql_extra_params="--skip-slave-start" fi mysql_common_start $mysql_extra_params rc=$? if [ $rc != $OCF_SUCCESS ]; then return $rc fi if ocf_is_ms; then # We're configured as a stateful resource. We must start as # slave by default. At this point we don't know if the CRM has # already promoted a master. So, we simply start in read only # mode. set_read_only on # Now, let's see whether there is a master. We might be a new # node that is just joining the cluster, and the CRM may have # promoted a master before. master_host=`echo $OCF_RESKEY_CRM_meta_notify_master_uname|tr -d " "` if [ "$master_host" -a "$master_host" != ${NODENAME} ]; then ocf_log info "Changing MySQL configuration to replicate from $master_host." set_master start_slave if [ $? -ne 0 ]; then ocf_exit_reason "Failed to start slave" return $OCF_ERR_GENERIC fi else ocf_log info "No MySQL master present - clearing replication state" unset_master fi # We also need to set a master preference, otherwise Pacemaker # won't ever promote us in the absence of any explicit # preference set by the administrator. We choose a low # greater-than-zero preference. $CRM_MASTER -v 1 fi # Initial monitor action if [ -n "$OCF_RESKEY_test_table" -a -n "$OCF_RESKEY_test_user" -a -n "$OCF_RESKEY_test_passwd" ]; then OCF_CHECK_LEVEL=10 fi mysql_monitor rc=$? 
if [ $rc != $OCF_SUCCESS -a $rc != $OCF_RUNNING_MASTER ]; then ocf_exit_reason "Failed initial monitor action" return $rc fi ocf_log info "MySQL started" return $OCF_SUCCESS } mysql_stop() { if ocf_is_ms; then # clear preference for becoming master $CRM_MASTER -D # Remove VIP capability set_reader_attr 0 fi mysql_common_stop } mysql_promote() { local master_info if ( ! mysql_common_status err ); then return $OCF_NOT_RUNNING fi ocf_run $MYSQL $MYSQL_OPTIONS_REPL \ -e "STOP SLAVE" # Set Master Info in CIB, cluster level attribute update_data_master_status master_info="$(get_local_ip)|$(get_master_status File)|$(get_master_status Position)" ${CRM_ATTR_REPL_INFO} -v "$master_info" rm -f $tmpfile set_read_only off || return $OCF_ERR_GENERIC # Existing master gets a higher-than-default master preference, so # the cluster manager does not shuffle the master role around # unnecessarily $CRM_MASTER -v $((${OCF_RESKEY_max_slave_lag}+1)) # A master can accept reads set_reader_attr 1 return $OCF_SUCCESS } mysql_demote() { if ! mysql_common_status err; then return $OCF_NOT_RUNNING fi # Return master preference to default, so the cluster manager gets # a chance to select a new master $CRM_MASTER -v 1 } mysql_notify() { # If not configured as a Stateful resource, we make no sense of # notifications. if ! ocf_is_ms; then ocf_log info "This agent makes no use of notifications unless running in master/slave mode." return $OCF_SUCCESS fi local type_op type_op="${OCF_RESKEY_CRM_meta_notify_type}-${OCF_RESKEY_CRM_meta_notify_operation}" ocf_log debug "Received $type_op notification." case "$type_op" in 'pre-promote') # Nothing to do now here, new replication info not yet published ;; 'post-promote') # The master has completed its promotion. Now is a good # time to check whether our replication slave is working # correctly. 
master_host=`echo $OCF_RESKEY_CRM_meta_notify_promote_uname|tr -d " "` if [ "$master_host" = ${NODENAME} ]; then ocf_log info "This will be the new master, ignoring post-promote notification." else ocf_log info "Resetting replication" unset_master if [ $? -ne 0 ]; then return $OCF_ERR_GENERIC fi ocf_log info "Changing MySQL configuration to replicate from $master_host" set_master if [ $? -ne 0 ]; then return $OCF_ERR_GENERIC fi start_slave if [ $? -ne 0 ]; then ocf_exit_reason "Failed to start slave" return $OCF_ERR_GENERIC fi fi return $OCF_SUCCESS ;; 'pre-demote') demote_host=`echo $OCF_RESKEY_CRM_meta_notify_demote_uname|tr -d " "` if [ $demote_host = ${NODENAME} ]; then ocf_log info "post-demote notification for $demote_host" set_read_only on if [ $? -ne 0 ]; then ocf_exit_reason "Failed to set read-only"; return $OCF_ERR_GENERIC; fi # Must kill all existing user threads because they are still Read/write # in order for the slaves to complete the read of binlogs local tmpfile tmpfile=`mktemp ${HA_RSCTMP}/threads.${OCF_RESOURCE_INSTANCE}.XXXXXX` $MYSQL $MYSQL_OPTIONS_REPL \ -e "SHOW PROCESSLIST" > $tmpfile for thread in `awk '$0 !~ /Binlog Dump|system user|event_scheduler|SHOW PROCESSLIST/ && $0 ~ /^[0-9]/ {print $1}' $tmpfile` do ocf_run $MYSQL $MYSQL_OPTIONS_REPL \ -e "KILL ${thread}" done else ocf_log info "Ignoring post-demote notification execpt for my own demotion." fi return $OCF_SUCCESS ;; 'post-demote') demote_host=`echo $OCF_RESKEY_CRM_meta_notify_demote_uname|tr -d " "` if [ $demote_host = ${NODENAME} ]; then ocf_log info "Ignoring post-demote notification for my own demotion." return $OCF_SUCCESS fi ocf_log info "post-demote notification for $demote_host." # The former master has just been gracefully demoted. 
unset_master ;; *) return $OCF_SUCCESS ;; esac } ####################################################################### ########################################################################## # If DEBUG_LOG is set, make this resource agent easy to debug: set up the # debug log and direct all output to it. Otherwise, redirect to /dev/null. # The log directory must be a directory owned by root, with permissions 0700, # and the log must be writable and not a symlink. ########################################################################## DEBUG_LOG="/tmp/mysql.ocf.ra.debug/log" if [ "${DEBUG_LOG}" -a -w "${DEBUG_LOG}" -a ! -L "${DEBUG_LOG}" ]; then DEBUG_LOG_DIR="${DEBUG_LOG%/*}" if [ -d "${DEBUG_LOG_DIR}" ]; then exec 9>>"$DEBUG_LOG" exec 2>&9 date >&9 echo "$*" >&9 env | grep OCF_ | sort >&9 set -x else exec 9>/dev/null fi fi case "$1" in meta-data) meta_data exit $OCF_SUCCESS;; usage|help) usage exit $OCF_SUCCESS;; esac mysql_common_validate rc=$? LSB_STATUS_STOPPED=3 if [ $rc -ne 0 ]; then case "$1" in stop) ;; monitor) mysql_common_status "info" if [ $? -eq $OCF_SUCCESS ]; then # if validatation fails and pid is active, always treat this as an error ocf_exit_reason "environment validation failed, active pid is in unknown state." exit $OCF_ERR_GENERIC fi # validation failed and pid is not active, it's safe to say this instance is inactive. exit $OCF_NOT_RUNNING;; status) exit $LSB_STATUS_STOPPED;; *) exit $rc;; esac fi # What kind of method was invoked? case "$1" in start) mysql_start;; stop) mysql_stop;; status) mysql_common_status err;; monitor) mysql_monitor;; promote) mysql_promote;; demote) mysql_demote;; notify) mysql_notify;; validate-all) exit $OCF_SUCCESS;; *) usage exit $OCF_ERR_UNIMPLEMENTED;; esac # vi:sw=4:ts=4:et: diff --git a/heartbeat/nagios b/heartbeat/nagios index e61306cf4..d2067bc38 100755 --- a/heartbeat/nagios +++ b/heartbeat/nagios @@ -1,246 +1,246 @@ #!/bin/sh # # License: GNU General Public License (GPL) # (c) 2015 T.J. Yang, O. 
Albrigtsen # and Linux-HA contributors # # ----------------------------------------------------------------------------- # O C F R E S O U R C E S C R I P T S P E C I F I C A T I O N # ----------------------------------------------------------------------------- # # NAME # nagios : OCF resource agent script for Nagios Server # # Initialization: : ${OCF_FUNCTIONS_DIR=${OCF_ROOT}/lib/heartbeat} . ${OCF_FUNCTIONS_DIR}/ocf-shellfuncs # Defaults OCF_RESKEY_user_default="nagios" OCF_RESKEY_group_default="nagios" OCF_RESKEY_binary_default="/usr/sbin/nagios" OCF_RESKEY_config_default="/etc/nagios/nagios.cfg" OCF_RESKEY_log_default="/var/log/nagios/nagios.log" OCF_RESKEY_retention_default="/var/log/nagios/retention.dat" OCF_RESKEY_command_default="/var/log/nagios/rw/nagios.cmd" OCF_RESKEY_pid_default="/var/run/nagios.pid" : ${OCF_RESKEY_user=${OCF_RESKEY_user_default}} : ${OCF_RESKEY_group=${OCF_RESKEY_group_default}} : ${OCF_RESKEY_binary=${OCF_RESKEY_binary_default}} : ${OCF_RESKEY_config=${OCF_RESKEY_config_default}} : ${OCF_RESKEY_log=${OCF_RESKEY_log_default}} : ${OCF_RESKEY_retention=${OCF_RESKEY_retention_default}} : ${OCF_RESKEY_command=${OCF_RESKEY_command_default}} : ${OCF_RESKEY_pid=${OCF_RESKEY_pid_default}} nagios_usage() { cat < 0.75 OCF Resource script for Nagios 3.x or 4.x. It manages a Nagios instance as a HA resource. Nagios resource agent User running Nagios daemon (for file permissions) Nagios user Group running Nagios daemon (for file permissions) Nagios group Location of the Nagios binary Nagios binary Configuration file Nagios config Location of the Nagios log Nagios log Location of the Nagios retention file Nagios retention file Location of the Nagios external command file Nagios command file Location of the Nagios pid/lock Nagios pid file END } nagios_start() { nagios_validate_all rc=$? if [ $rc -ne 0 ]; then return $rc fi - # if resource is already running,no need to continue code after this. 
if nagios_monitor; then ocf_log info "Nagios is already running" return $OCF_SUCCESS fi # Remove ${OCF_RESKEY_pid} if it exists - rm -f ${OCF_RESKEY_pid} + rm -f "${OCF_RESKEY_pid}" ocf_run -q touch ${OCF_RESKEY_log} ${OCF_RESKEY_retention} ${OCF_RESKEY_pid} chown ${OCF_RESKEY_user}:${OCF_RESKEY_group} ${OCF_RESKEY_log} ${OCF_RESKEY_retention} ${OCF_RESKEY_pid} - rm -f ${OCF_RESKEY_command} + rm -f "${OCF_RESKEY_command}" + [ -x /sbin/restorecon ] && /sbin/restorecon ${OCF_RESKEY_pid} ocf_run -q ${OCF_RESKEY_binary} -d ${OCF_RESKEY_config} while ! nagios_monitor; do sleep 1 done - if [ $? -eq "0" ]; then + if [ $? -eq 0 ]; then ocf_log info "Nagios started" return ${OCF_SUCCESS} fi return $OCF_SUCCESS } nagios_stop() { nagios_monitor - if [ "$?" -ne "$OCF_SUCCESS" ]; then + if [ $? -ne $OCF_SUCCESS ]; then # Currently not running. Nothing to do. ocf_log info "Resource is already stopped" rm -f ${OCF_RESKEY_pid} return $OCF_SUCCESS fi kill `cat ${OCF_RESKEY_pid}` # Wait for process to stop while nagios_monitor; do sleep 1 done - + return $OCF_SUCCESS } nagios_monitor(){ ocf_pidfile_status ${OCF_RESKEY_pid} > /dev/null 2>&1 case "$?" in 0) rc=$OCF_SUCCESS ;; 1|2) rc=$OCF_NOT_RUNNING ;; *) rc=$OCF_ERR_GENERIC ;; esac return $rc } nagios_validate_all(){ - check_binary ${OCF_RESKEY_binary} - - if [ ! -f ${OCF_RESKEY_config} ]; then + check_binary "${OCF_RESKEY_binary}" + + if [ ! -f "${OCF_RESKEY_config}" ]; then ocf_exit_reason "Configuration file ${OCF_RESKEY_config} not found" return ${OCF_ERR_INSTALLED} fi - - ${OCF_RESKEY_binary} -v ${OCF_RESKEY_config} > /dev/null 2>&1; - if [ $? -ne "0" ]; then + + ${OCF_RESKEY_binary} -v ${OCF_RESKEY_config} >/dev/null 2>&1 + if [ $? 
-ne 0 ]; then ocf_exit_reason "Configuration check failed" return ${OCF_ERR_INSTALLED} fi } # **************************** MAIN SCRIPT ************************************ # Make sure meta-data and usage always succeed case $__OCF_ACTION in meta-data) nagios_meta_data exit $OCF_SUCCESS ;; usage|help) nagios_usage exit $OCF_SUCCESS ;; esac # This OCF agent script need to be run as root user. if ! ocf_is_root; then echo "$0 agent script need to be run as root user." ocf_log debug "$0 agent script need to be run as root user." exit $OCF_ERR_GENERIC fi # Translate each action into the appropriate function call case $__OCF_ACTION in start) nagios_start;; stop) nagios_stop;; status|monitor) nagios_monitor;; validate-all) nagios_validate_all;; *) nagios_usage exit $OCF_ERR_UNIMPLEMENTED ;; esac rc=$? exit $rc - + # End of this script diff --git a/heartbeat/nfsserver b/heartbeat/nfsserver index 3cec5c8e7..d1e6259e3 100755 --- a/heartbeat/nfsserver +++ b/heartbeat/nfsserver @@ -1,660 +1,866 @@ #!/bin/sh # nfsserver # # Description: Manages nfs server as OCF resource # by hxinwei@gmail.com # License: GNU General Public License v2 (GPLv2) and later if [ -n "$OCF_DEBUG_LIBRARY" ]; then . $OCF_DEBUG_LIBRARY else : ${OCF_FUNCTIONS_DIR=${OCF_ROOT}/lib/heartbeat} . ${OCF_FUNCTIONS_DIR}/ocf-shellfuncs fi if is_redhat_based; then . ${OCF_FUNCTIONS_DIR}/nfsserver-redhat.sh fi +DEFAULT_INIT_SCRIPT_LIST="/etc/init.d/nfsserver /etc/init.d/nfs /etc/init.d/nfs-kernel-server" DEFAULT_INIT_SCRIPT="/etc/init.d/nfsserver" -if ! 
[ -f $DEFAULT_INIT_SCRIPT ]; then - # On some systems, the script is just called nfs - DEFAULT_INIT_SCRIPT="/etc/init.d/nfs" -fi +for script in $DEFAULT_INIT_SCRIPT_LIST +do + if [ -f $script -a -x $script ]; then + DEFAULT_INIT_SCRIPT=$script + break + fi +done DEFAULT_NOTIFY_CMD=`which sm-notify` DEFAULT_NOTIFY_CMD=${DEFAULT_NOTIFY_CMD:-"/sbin/sm-notify"} DEFAULT_NOTIFY_FOREGROUND="false" DEFAULT_RPCPIPEFS_DIR="/var/lib/nfs/rpc_pipefs" EXEC_MODE=0 SELINUX_ENABLED=-1 STATD_PATH="/var/lib/nfs" STATD_DIR="" nfsserver_meta_data() { cat < 1.0 Nfsserver helps to manage the Linux nfs server as a failover-able resource in Linux-HA. It depends on Linux specific NFS implementation details, so is considered not portable to other platforms yet. Manages an NFS server The default init script shipped with the Linux distro. The nfsserver resource agent offloads the start/stop/monitor work to the init script because the procedure to start/stop/monitor nfsserver varies on different Linux distro. In the event that this option is not set, this agent will attempt to use an init script at this location, ${DEFAULT_INIT_SCRIPT}, or detect a systemd unit-file to use in the event that no init script is detected. Init script for nfsserver Do not send reboot notifications to NFSv3 clients during server startup. Disable NFSv3 server reboot notifications Keeps the sm-notify attached to its controlling terminal and running in the foreground. Keeps the notify tool running in the foreground. Specifies the length of sm-notify retry time, in minutes, to continue retrying notifications to unresponsive hosts. If this option is not specified, sm-notify attempts to send notifications for 15 minutes. Specifying a value of 0 causes sm-notify to continue sending notifications to unresponsive peers until it is manually killed. Specifies the length of sm-notify retry time (minutes). Comma separated list of floating IP addresses used to access the nfs service IP addresses. 
The nfsserver resource agent will save nfs related information in this specific directory. And this directory must be able to fail-over before nfsserver itself. Directory to store nfs server related information. The mount point for the sunrpc file system. Default is $DEFAULT_RPCPIPEFS_DIR. This script will mount (bind) nfs_shared_infodir on /var/lib/nfs/ (cannot be changed), and this script will mount the sunrpc file system on $DEFAULT_RPCPIPEFS_DIR (default, can be changed by this parameter). If you want to move only rpc_pipefs/ (e.g. to keep rpc_pipefs/ local) from default, please set this value. The mount point for the sunrpc file system. $( is_redhat_based && nfsserver_redhat_meta_data ) END return $OCF_SUCCESS } nfsserver_usage() { cat < /dev/null 2>&1 && selinuxenabled SELINUX_ENABLED=$? if [ $SELINUX_ENABLED -eq 0 ]; then export SELINUX_LABEL="$(ls -ldZ $STATD_PATH | cut -f4 -d' ')" fi ## # EXEC_MODE values # 1 user init script or default init script # 2 systemd (with nfs-lock.service) # 3 systemd (with rpc-statd.service) # # On error, this function will terminate the process # with error code $OCF_ERR_INSTALLED ## set_exec_mode() { ## # If EXEC_MODE is already set, we don't need to run this function again. ## if [ $EXEC_MODE -ne 0 ]; then return 0; fi ## # If the user defined an init script, It must exist for us to continue ## if [ -n "$OCF_RESKEY_nfs_init_script" ]; then # check_binary will exit the process if init script does not exist check_binary ${OCF_RESKEY_nfs_init_script} EXEC_MODE=1 return 0 fi ## # Check to see if the default init script exists, if so we'll use that. ## if which $DEFAULT_INIT_SCRIPT > /dev/null 2>&1; then OCF_RESKEY_nfs_init_script=$DEFAULT_INIT_SCRIPT EXEC_MODE=1 return 0 fi ## # Attempt systemd (with nfs-lock.service). 
## if which systemctl > /dev/null 2>&1; then if systemctl list-unit-files | grep nfs-server > /dev/null && systemctl list-unit-files | grep nfs-lock > /dev/null; then EXEC_MODE=2 # when using systemd, the nfs-lock service file handles nfsv3 locking daemons for us. return 0 fi fi ## # Attempt systemd (with rpc-statd.service). ## if which systemctl > /dev/null 2>&1; then if systemctl list-unit-files | grep nfs-server > /dev/null && systemctl list-unit-files | grep rpc-statd > /dev/null; then EXEC_MODE=3 return 0 fi fi ocf_exit_reason "No init script or systemd unit file detected for nfs server" exit $OCF_ERR_INSTALLED } ## # wrapper for init script and systemd calls. ## nfs_exec() { local cmd=$1 + local svc=$2 set_exec_mode case $EXEC_MODE in 1) ${OCF_RESKEY_nfs_init_script} $cmd;; - 2) systemctl $cmd nfs-server.service ;; - 3) systemctl $cmd nfs-server.service ;; + 2) if ! echo $svc | grep -q "\."; then + svc="${svc}.service" + fi + systemctl $cmd $svc + ;; + 3) if ! echo $svc | grep -q "\."; then + svc="${svc}.service" + fi + systemctl $cmd $svc + ;; esac } v3locking_exec() { local cmd=$1 set_exec_mode if [ $EXEC_MODE -eq 2 ]; then - systemctl $cmd nfs-lock.service + nfs_exec $cmd nfs-lock.service elif [ $EXEC_MODE -eq 3 ]; then - systemctl $cmd rpc-statd.service + nfs_exec $cmd rpc-statd.service else case $cmd in start) locking_start;; stop) locking_stop;; status) locking_status;; esac fi } +nfsserver_systemd_monitor() +{ + local threads_num + local rc + local fn + + ocf_log debug "Status: rpcbind" + rpcinfo > /dev/null 2>&1 + rc=$? + if [ "$rc" -ne "0" ]; then + ocf_exit_reason "rpcbind is not running" + return $OCF_NOT_RUNNING + fi + + ocf_log debug "Status: nfs-mountd" + rpcinfo -t localhost 100005 > /dev/null 2>&1 + rc=$? + if [ "$rc" -ne "0" ]; then + ocf_exit_reason "nfs-mountd is not running" + return $OCF_NOT_RUNNING + fi + + ocf_log debug "Status: nfs-idmapd" + fn=`mktemp` + nfs_exec status nfs-idmapd > $fn 2>&1 + rc=$? 
+ ocf_log debug "$(cat $fn)" + rm -f $fn + if [ "$rc" -ne "0" ]; then + ocf_exit_reason "nfs-idmapd is not running" + return $OCF_NOT_RUNNING + fi + + ocf_log debug "Status: rpc-statd" + rpcinfo -t localhost 100024 > /dev/null 2>&1 + rc=$? + if [ "$rc" -ne "0" ]; then + ocf_exit_reason "rpc-statd is not running" + return $OCF_NOT_RUNNING + fi + + nfs_exec is-active nfs-server + rc=$? + + # Now systemctl is-active can't detect the failure of kernel process like nfsd. + # So, if the return value of systemctl is-active is 0, check the threads number + # to make sure the process is running really. + # /proc/fs/nfsd/threads has the numbers of the nfsd threads. + if [ $rc -eq 0 ]; then + threads_num=`cat /proc/fs/nfsd/threads 2>/dev/null` + if [ $? -eq 0 ]; then + if [ $threads_num -gt 0 ]; then + return $OCF_SUCCESS + else + return 3 + fi + else + return $OCF_ERR_GENERIC + fi + fi + + return $rc +} + nfsserver_monitor () { + local fn + + set_exec_mode fn=`mktemp` - nfs_exec status > $fn 2>&1 + case $EXEC_MODE in + 1) nfs_exec status nfs-server > $fn 2>&1;; + [23]) nfsserver_systemd_monitor > $fn 2>&1;; + esac rc=$? ocf_log debug "$(cat $fn)" rm -f $fn #Adapte LSB status code to OCF return code if [ $rc -eq 0 ]; then # don't report success if nfs servers are up # without locking daemons. v3locking_exec "status" rc=$? 
if [ $rc -ne 0 ]; then ocf_exit_reason "NFS server is up, but the locking daemons are down" rc=$OCF_ERR_GENERIC fi return $rc - elif [ $rc -eq 3 ]; then + elif [ $rc -eq 3 ] || [ $rc -eq $OCF_NOT_RUNNING ]; then return $OCF_NOT_RUNNING else return $OCF_ERR_GENERIC fi } prepare_directory () { if [ -z "$fp" ]; then return fi [ -d "$fp" ] || mkdir -p $fp [ -d "$rpcpipefs_make_dir" ] || mkdir -p $rpcpipefs_make_dir [ -d "$fp/v4recovery" ] || mkdir -p $fp/v4recovery [ -d "$fp/$STATD_DIR" ] || mkdir -p "$fp/$STATD_DIR" [ -d "$fp/$STATD_DIR/sm" ] || mkdir -p "$fp/$STATD_DIR/sm" [ -d "$fp/$STATD_DIR/sm.ha" ] || mkdir -p "$fp/$STATD_DIR/sm.ha" [ -d "$fp/$STATD_DIR/sm.bak" ] || mkdir -p "$fp/$STATD_DIR/sm.bak" [ -n "`id -u rpcuser 2>/dev/null`" -a "`id -g rpcuser 2>/dev/null`" ] && chown -R rpcuser.rpcuser "$fp/$STATD_DIR" [ -f "$fp/etab" ] || touch "$fp/etab" [ -f "$fp/xtab" ] || touch "$fp/xtab" [ -f "$fp/rmtab" ] || touch "$fp/rmtab" dd if=/dev/urandom of=$fp/$STATD_DIR/state bs=1 count=4 >/dev/null 2>&1 [ -n "`id -u rpcuser 2>/dev/null`" -a "`id -g rpcuser 2>/dev/null`" ] && chown rpcuser.rpcuser "$fp/$STATD_DIR/state" [ $SELINUX_ENABLED -eq 0 ] && chcon -R "$SELINUX_LABEL" "$fp" } is_bound () { if mount | grep -q "on $1 type"; then return 0 fi return 1 } bind_tree () { if [ -z "$fp" ]; then return fi if is_bound /var/lib/nfs; then ocf_log debug "$fp is already bound to /var/lib/nfs" return 0 fi mount --bind $fp /var/lib/nfs [ $SELINUX_ENABLED -eq 0 ] && restorecon /var/lib/nfs } unbind_tree () { if `mount | grep -q " on $rpcpipefs_umount_dir"`; then umount -t rpc_pipefs $rpcpipefs_umount_dir fi if is_bound /var/lib/nfs; then umount /var/lib/nfs fi } binary_status() { local binary=$1 local pid pid=$(pgrep ${binary}) case $? in 0) echo "$pid" return $OCF_SUCCESS;; 1) return $OCF_NOT_RUNNING;; *) return $OCF_ERR_GENERIC;; esac } locking_status() { binary_status "rpc.statd" > /dev/null 2>&1 } locking_start() { local ret=$OCF_SUCCESS ocf_log info "Starting rpc.statd." 
rpc.statd $STATDARG ret=$? if [ $ret -ne 0 ]; then ocf_log err "Failed to start rpc.statd" return $ret fi [ -d /var/lock/subsys ] && touch /var/lock/subsys/nfslock return $ret } terminate() { local pids local i=0 while : ; do pids=$(binary_status $1) [ -z "$pids" ] && return 0 kill $pids sleep 1 i=$((i + 1)) [ $i -gt 3 ] && return 1 done } killkill() { local pids local i=0 while : ; do pids=$(binary_status $1) [ -z "$pids" ] && return 0 kill -9 $pids sleep 1 i=$((i + 1)) [ $i -gt 3 ] && return 1 done } stop_process() { local process=$1 ocf_log info "Stopping $process" if terminate $process; then ocf_log debug "$process is stopped" else if killkill $process; then ocf_log debug "$process is stopped" else ocf_log debug "Failed to stop $process" return 1 fi fi return 0 } locking_stop() { ret=0 # sm-notify can prevent umount of /var/lib/nfs/statd if # it is still trying to notify unresponsive clients. stop_process sm-notify if [ $? -ne 0 ]; then ret=$OCF_ERR_GENERIC fi stop_process rpc.statd if [ $? 
-ne 0 ]; then ret=$OCF_ERR_GENERIC fi return $ret } notify_locks() { if ocf_is_true "$OCF_RESKEY_nfs_no_notify"; then # we've been asked not to notify clients return; fi # run in foreground, if requested if ocf_is_true "$OCF_RESKEY_nfs_notify_foreground"; then opts="-d" fi if [ -n "$OCF_RESKEY_nfs_smnotify_retry_time" ]; then opts="$opts -m $OCF_RESKEY_nfs_smnotify_retry_time" fi if [ -n "$OCF_RESKEY_statd_outgoing_port" ]; then opts="$opts -p $OCF_RESKEY_statd_outgoing_port" fi # forces re-notificaiton regardless if notifies have already gone out opts="$opts -f" ocf_log info "executing sm-notify" if [ -n "$OCF_RESKEY_nfs_ip" ]; then for ip in `echo ${OCF_RESKEY_nfs_ip} | sed 's/,/ /g'`; do cp -rpfn $STATD_PATH/sm.ha/* $STATD_PATH/ > /dev/null 2>&1 sm-notify $opts -v $ip done else sm-notify $opts fi } nfsserver_start () { local rc; + local fn if nfsserver_monitor; then ocf_log debug "NFS server is already started" return $OCF_SUCCESS fi is_redhat_based && set_env_args prepare_directory bind_tree # remove the sm-notify pid so sm-notify will be allowed to run again without requiring a reboot. rm -f /var/run/sm-notify.pid # # Synchronize these before starting statd # cp -rpfn $STATD_PATH/sm.ha/* $STATD_PATH/ > /dev/null 2>&1 rm -rf $STATD_PATH/sm.ha/* > /dev/null 2>&1 cp -rpf $STATD_PATH/sm $STATD_PATH/sm.bak /var/lib/nfs/state $STATD_PATH/sm.ha > /dev/null 2>&1 ocf_log info "Starting NFS server ..." # mounts /proc/fs/nfsd for us lsmod | grep -q nfsd if [ $? -ne 0 ]; then modprobe nfsd fi + # systemd + case $EXEC_MODE in + [23]) nfs_exec start rpcbind + local i=1 + while : ; do + ocf_log info "Start: rpcbind i: $i" + rpcinfo > /dev/null 2>&1 + rc=$? + if [ "$rc" -eq "0" ]; then + break; + fi + sleep 1 + i=$((i + 1)) + done + ;; + esac + # check to see if we need to start rpc.statd v3locking_exec "status" if [ $? -ne $OCF_SUCCESS ]; then v3locking_exec "start" rc=$? 
if [ $rc -ne 0 ]; then ocf_exit_reason "Failed to start NFS server locking daemons" return $rc fi else ocf_log info "rpc.statd already up" fi + # systemd + case $EXEC_MODE in + [23]) nfs_exec start nfs-mountd + local i=1 + while : ; do + ocf_log info "Start: nfs-mountd i: $i" + rpcinfo -t localhost 100005 > /dev/null 2>&1 + rc=$? + if [ "$rc" -eq "0" ]; then + break; + fi + sleep 1 + i=$((i + 1)) + done + + nfs_exec start nfs-idmapd + local i=1 + while : ; do + ocf_log info "Start: nfs-idmapd i: $i" + fn=`mktemp` + nfs_exec status nfs-idmapd > $fn 2>&1 + rc=$? + ocf_log debug "$(cat $fn)" + rm -f $fn + if [ "$rc" -eq "0" ]; then + break; + fi + sleep 1 + i=$((i + 1)) + done + + nfs_exec start rpc-statd + local i=1 + while : ; do + ocf_log info "Start: rpc-statd i: $i" + rpcinfo -t localhost 100024 > /dev/null 2>&1 + rc=$? + if [ "$rc" -eq "0" ]; then + break; + fi + sleep 1 + i=$((i + 1)) + done + esac + + fn=`mktemp` - nfs_exec start > $fn 2>&1 + nfs_exec start nfs-server > $fn 2>&1 rc=$? ocf_log debug "$(cat $fn)" rm -f $fn if [ $rc -ne 0 ]; then ocf_exit_reason "Failed to start NFS server" return $rc - fi + fi + + tfn="/proc/fs/nfsd/threads" + if [ ! -f "$tfn" ] || [ "$(cat $tfn)" -le "0" ]; then + ocf_exit_reason "Failed to start NFS server: /proc/fs/nfsd/threads" + return $OCF_ERR_GENERIC + fi notify_locks ocf_log info "NFS server started" return $OCF_SUCCESS } nfsserver_stop () { + local fn + ocf_log info "Stopping NFS server ..." # backup the current sm state information to the ha folder before stopping. # the ha folder will be synced after startup, restoring the statd client state rm -rf $STATD_PATH/sm.ha/* > /dev/null 2>&1 cp -rpf $STATD_PATH/sm $STATD_PATH/sm.bak /var/lib/nfs/state $STATD_PATH/sm.ha > /dev/null 2>&1 fn=`mktemp` - nfs_exec stop > $fn 2>&1 + nfs_exec stop nfs-server > $fn 2>&1 rc=$? 
ocf_log debug "$(cat $fn)" rm -f $fn + if [ $rc -ne 0 ]; then + ocf_exit_reason "Failed to stop NFS server" + return $rc + fi + + # systemd + case $EXEC_MODE in + [23]) ocf_log info "Stop: threads" + tfn="/proc/fs/nfsd/threads" + if [ -f "$tfn" ] && [ "$(cat $tfn)" -gt "0" ]; then + ocf_exit_reason "NFS server failed to stop: /proc/fs/nfsd/threads" + return $OCF_ERR_GENERIC + fi + + nfs_exec stop rpc-statd > /dev/null 2>&1 + ocf_log info "Stop: rpc-statd" + rpcinfo -t localhost 100024 > /dev/null 2>&1 + rc=$? + if [ "$rc" -eq "0" ]; then + ocf_exit_reason "Failed to stop rpc-statd" + return $OCF_ERR_GENERIC + fi + + nfs_exec stop nfs-idmapd > /dev/null 2>&1 + ocf_log info "Stop: nfs-idmapd" + fn=`mktemp` + nfs_exec status nfs-idmapd > $fn 2>&1 + rc=$? + ocf_log debug "$(cat $fn)" + rm -f $fn + if [ "$rc" -eq "0" ]; then + ocf_exit_reason "Failed to stop nfs-idmapd" + return $OCF_ERR_GENERIC + fi + + nfs_exec stop nfs-mountd > /dev/null 2>&1 + ocf_log info "Stop: nfs-mountd" + rpcinfo -t localhost 100005 > /dev/null 2>&1 + rc=$? + if [ "$rc" -eq "0" ]; then + ocf_exit_reason "Failed to stop nfs-mountd" + return $OCF_ERR_GENERIC + fi + esac + + v3locking_exec "stop" if [ $? 
-ne 0 ]; then ocf_exit_reason "Failed to stop NFS locking daemons" rc=$OCF_ERR_GENERIC fi - if [ $rc -eq 0 ]; then - unbind_tree - ocf_log info "NFS server stopped" - else - ocf_exit_reason "Failed to stop NFS server" - fi - return $rc + # systemd + case $EXEC_MODE in + [23]) nfs_exec stop rpcbind > /dev/null 2>&1 + ocf_log info "Stop: rpcbind" + + nfs_exec stop rpc-gssd > /dev/null 2>&1 + ocf_log info "Stop: rpc-gssd" + esac + + unbind_tree + ocf_log info "NFS server stopped" + return 0 } nfsserver_validate () { ## # set_exec_mode will exit if nfs server is not installed ## set_exec_mode check_binary ${OCF_RESKEY_nfs_notify_cmd} if [ -n "$OCF_RESKEY_CRM_meta_clone" ] && [ -n "$OCF_RESKEY_nfs_shared_infodir" ]; then ocf_exit_reason "This RA does not support clone mode when a shared info directory is in use." exit $OCF_ERR_CONFIGURED fi if [ -n "$OCF_RESKEY_nfs_smnotify_retry_time" ]; then if ! ocf_is_decimal "$OCF_RESKEY_nfs_smnotify_retry_time"; then ocf_exit_reason "Invalid nfs_smnotify_retry_time [$OCF_RESKEY_nfs_smnotify_retry_time]" exit $OCF_ERR_CONFIGURED fi fi case ${OCF_RESKEY_nfs_notify_cmd##*/} in sm-notify|rpc.statd) ;; *) ocf_exit_reason "Invalid nfs_notify_cmd [$OCF_RESKEY_nfs_notify_cmd]" exit $OCF_ERR_CONFIGURED ;; esac return $OCF_SUCCESS } nfsserver_validate case $__OCF_ACTION in start) nfsserver_start ;; stop) nfsserver_stop ;; monitor) nfsserver_monitor ;; validate-all) exit $OCF_SUCCESS ;; *) nfsserver_usage exit $OCF_ERR_UNIMPLEMENTED ;; esac diff --git a/heartbeat/ocf-directories.in b/heartbeat/ocf-directories.in index 6e0a9d542..8d7077627 100644 --- a/heartbeat/ocf-directories.in +++ b/heartbeat/ocf-directories.in @@ -1,22 +1,22 @@ # Binaries and binary options for use in Resource Agents prefix=@prefix@ exec_prefix=@exec_prefix@ : ${INITDIR:=@INITDIR@} : ${HA_DIR:=@sysconfdir@/ha.d} : ${HA_RCDIR:=$HA_DIR/rc.d} : ${HA_CONFDIR=$HA_DIR/conf} : ${HA_CF:=$HA_DIR/ha.cf} : ${HA_VARLIB:=@localstatedir@/lib/heartbeat} : ${HA_RSCTMP:=@HA_RSCTMPDIR@} : 
${HA_RSCTMP_OLD:=@HA_VARRUNDIR@/heartbeat/rsctmp} : ${HA_FIFO:=@localstatedir@/lib/heartbeat/fifo} : ${HA_BIN:=@libexecdir@/heartbeat} : ${HA_SBIN_DIR:=@sbindir@} : ${HA_DATEFMT:="%Y/%m/%d_%T "} : ${HA_DEBUGLOG:=/dev/null} : ${HA_RESOURCEDIR:=$HA_DIR/resource.d} : ${HA_DOCDIR:=@datadir@/doc/heartbeat} : ${__SCRIPT_NAME:=`basename $0`} -: ${HA_VARRUN:=@localstatedir@/run/} -: ${HA_VARLOCK:=@localstatedir@/lock/subsys/} +: ${HA_VARRUN:=@localstatedir@/run} +: ${HA_VARLOCK:=@localstatedir@/lock/subsys} diff --git a/heartbeat/ocf-shellfuncs.in b/heartbeat/ocf-shellfuncs.in index e00df4487..6d9669d17 100644 --- a/heartbeat/ocf-shellfuncs.in +++ b/heartbeat/ocf-shellfuncs.in @@ -1,923 +1,922 @@ # # # Common helper functions for the OCF Resource Agents supplied by # heartbeat. # # Copyright (c) 2004 SUSE LINUX AG, Lars Marowsky-Brée # All Rights Reserved. # # # This library is free software; you can redistribute it and/or # modify it under the terms of the GNU Lesser General Public # License as published by the Free Software Foundation; either # version 2.1 of the License, or (at your option) any later version. # # This library is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU # Lesser General Public License for more details. # # You should have received a copy of the GNU Lesser General Public # License along with this library; if not, write to the Free Software # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA # # Build version: $Format:%H$ # TODO: Some of this should probably split out into a generic OCF # library for shell scripts, but for the time being, we'll just use it # ourselves... 
# # TODO wish-list: # - Generic function for evaluating version numbers # - Generic function(s) to extract stuff from our own meta-data # - Logging function which automatically adds resource identifier etc # prefixes # TODO: Move more common functionality for OCF RAs here. # # This was common throughout all legacy Heartbeat agents unset LC_ALL; export LC_ALL unset LANGUAGE; export LANGUAGE __SCRIPT_NAME=`basename $0` if [ -z "$OCF_ROOT" ]; then : ${OCF_ROOT=@OCF_ROOT_DIR@} fi if [ "$OCF_FUNCTIONS_DIR" = ${OCF_ROOT}/resource.d/heartbeat ]; then # old unset OCF_FUNCTIONS_DIR fi : ${OCF_FUNCTIONS_DIR:=${OCF_ROOT}/lib/heartbeat} . ${OCF_FUNCTIONS_DIR}/ocf-binaries . ${OCF_FUNCTIONS_DIR}/ocf-returncodes . ${OCF_FUNCTIONS_DIR}/ocf-directories . ${OCF_FUNCTIONS_DIR}/ocf-rarun . ${OCF_FUNCTIONS_DIR}/ocf-distro # Define OCF_RESKEY_CRM_meta_interval in case it isn't already set, # to make sure that ocf_is_probe() always works : ${OCF_RESKEY_CRM_meta_interval=0} ocf_is_root() { if [ X`id -u` = X0 ]; then true else false fi } ocf_maybe_random() { local rnd="$RANDOM" # Something sane-ish in case a shell doesn't support $RANDOM [ -n "$rnd" ] || rnd=$$ echo $rnd } # Portability comments: # o The following rely on Bourne "sh" pattern-matching, which is usually # that for filename generation (note: not regexp). # o The "*) true ;;" clause is probably unnecessary, but is included # here for completeness. # o The negation in the pattern uses "!". This seems to be common # across many OSes (whereas the alternative "^" fails on some). 
# o If an OS is encountered where this negation fails, then a possible # alternative would be to replace the function contents by (e.g.): # [ -z "`echo $1 | tr -d '[0-9]'`" ] # ocf_is_decimal() { case "$1" in ""|*[!0-9]*) # empty, or at least one non-decimal false ;; *) true ;; esac } ocf_is_true() { case "$1" in yes|true|1|YES|TRUE|ja|on|ON) true ;; *) false ;; esac } ocf_is_hex() { case "$1" in ""|*[!0-9a-fA-F]*) # empty, or at least one non-hex false ;; *) true ;; esac } ocf_is_octal() { case "$1" in ""|*[!0-7]*) # empty, or at least one non-octal false ;; *) true ;; esac } __ocf_set_defaults() { __OCF_ACTION="$1" # Return to sanity for the agents... unset LANG LC_ALL=C export LC_ALL # TODO: Review whether we really should source this. Or rewrite # to match some emerging helper function syntax...? This imports # things which no OCF RA should be using... # Strip the OCF_RESKEY_ prefix from this particular parameter if [ -z "$OCF_RESKEY_OCF_CHECK_LEVEL" ]; then : ${OCF_CHECK_LEVEL:=0} else : ${OCF_CHECK_LEVEL:=$OCF_RESKEY_OCF_CHECK_LEVEL} fi if [ ! -d "$OCF_ROOT" ]; then ha_log "ERROR: OCF_ROOT points to non-directory $OCF_ROOT." exit $OCF_ERR_GENERIC fi if [ -z "$OCF_RESOURCE_TYPE" ]; then : ${OCF_RESOURCE_TYPE:=$__SCRIPT_NAME} fi + if [ "x$__OCF_ACTION" = "xmeta-data" ]; then + : ${OCF_RESOURCE_INSTANCE:="RESOURCE_ID"} + fi + if [ -z "$OCF_RA_VERSION_MAJOR" ]; then : We are being invoked as an init script. : Fill in some things with reasonable values. : ${OCF_RESOURCE_INSTANCE:="default"} return 0 fi - if [ "x$__OCF_ACTION" = "xmeta-data" ]; then - OCF_RESOURCE_INSTANCE="undef" - fi - if [ -z "$OCF_RESOURCE_INSTANCE" ]; then ha_log "ERROR: Need to tell us our resource instance name." 
exit $OCF_ERR_ARGS fi } hadate() { date "+${HA_DATEFMT}" } set_logtag() { if [ -z "$HA_LOGTAG" ]; then if [ -n "$OCF_RESOURCE_INSTANCE" ]; then HA_LOGTAG="$__SCRIPT_NAME($OCF_RESOURCE_INSTANCE)[$$]" else HA_LOGTAG="$__SCRIPT_NAME[$$]" fi fi } __ha_log() { local ignore_stderr=false local loglevel [ "x$1" = "x--ignore-stderr" ] && ignore_stderr=true && shift [ none = "$HA_LOGFACILITY" ] && HA_LOGFACILITY="" # if we're connected to a tty, then output to stderr if tty >/dev/null; then if [ "x$HA_debug" = "x0" -a "x$loglevel" = xdebug ] ; then return 0 elif [ "$ignore_stderr" = "true" ]; then # something already printed this error to stderr, so ignore return 0 fi if [ "$HA_LOGTAG" ]; then echo "$HA_LOGTAG: $*" else echo "$*" fi >&2 return 0 fi set_logtag if [ "x${HA_LOGD}" = "xyes" ] ; then ha_logger -t "${HA_LOGTAG}" "$@" if [ "$?" -eq "0" ] ; then return 0 fi fi if [ -n "$HA_LOGFACILITY" ] then : logging through syslog # loglevel is unknown, use 'notice' for now loglevel=notice case "${*}" in *ERROR*) loglevel=err;; *WARN*) loglevel=warning;; *INFO*|info) loglevel=info;; esac logger -t "$HA_LOGTAG" -p ${HA_LOGFACILITY}.${loglevel} "${*}" fi if [ -n "$HA_LOGFILE" ] then : appending to $HA_LOGFILE echo "$HA_LOGTAG: "`hadate`"${*}" >> $HA_LOGFILE fi if [ -z "$HA_LOGFACILITY" -a -z "$HA_LOGFILE" ] && ! [ "$ignore_stderr" = "true" ] then : appending to stderr echo `hadate`"${*}" >&2 fi if [ -n "$HA_DEBUGLOG" ] then : appending to $HA_DEBUGLOG if [ "$HA_LOGFILE"x != "$HA_DEBUGLOG"x ]; then echo "$HA_LOGTAG: "`hadate`"${*}" >> $HA_DEBUGLOG fi fi } ha_log() { __ha_log "$@" } ha_debug() { if [ "x${HA_debug}" = "x0" ] ; then return 0 fi if tty >/dev/null; then if [ "$HA_LOGTAG" ]; then echo "$HA_LOGTAG: $*" else echo "$*" fi >&2 return 0 fi set_logtag if [ "x${HA_LOGD}" = "xyes" ] ; then ha_logger -t "${HA_LOGTAG}" -D "ha-debug" "$@" if [ "$?" 
-eq "0" ] ; then return 0 fi fi [ none = "$HA_LOGFACILITY" ] && HA_LOGFACILITY="" if [ -n "$HA_LOGFACILITY" ] then : logging through syslog logger -t "$HA_LOGTAG" -p "${HA_LOGFACILITY}.debug" "${*}" fi if [ -n "$HA_DEBUGLOG" ] then : appending to $HA_DEBUGLOG echo "$HA_LOGTAG: "`hadate`"${*}" >> $HA_DEBUGLOG fi if [ -z "$HA_LOGFACILITY" -a -z "$HA_DEBUGLOG" ] then : appending to stderr echo "$HA_LOGTAG: `hadate`${*}: ${HA_LOGFACILITY}" >&2 fi } ha_parameter() { local VALUE VALUE=`sed -e 's%[ ][ ]*% %' -e 's%^ %%' -e 's%#.*%%' $HA_CF | grep -i "^$1 " | sed 's%[^ ]* %%'` if [ "X$VALUE" = X ] then case $1 in keepalive) VALUE=2;; deadtime) ka=`ha_parameter keepalive` VALUE=`expr $ka '*' 2 '+' 1`;; esac fi echo $VALUE } ocf_log() { # TODO: Revisit and implement internally. if [ $# -lt 2 ] then ocf_log err "Not enough arguments [$#] to ocf_log." fi __OCF_PRIO="$1" shift __OCF_MSG="$*" case "${__OCF_PRIO}" in crit) __OCF_PRIO="CRIT";; err) __OCF_PRIO="ERROR";; warn) __OCF_PRIO="WARNING";; info) __OCF_PRIO="INFO";; debug)__OCF_PRIO="DEBUG";; *) __OCF_PRIO=`echo ${__OCF_PRIO}| tr '[a-z]' '[A-Z]'`;; esac if [ "${__OCF_PRIO}" = "DEBUG" ]; then ha_debug "${__OCF_PRIO}: $__OCF_MSG" else ha_log "${__OCF_PRIO}: $__OCF_MSG" fi } # # ocf_exit_reason: print exit error string to stderr # Usage: Allows the OCF script to provide a string # describing why the exit code was returned. # Arguments: reason - required, The string that represents why the error # occured. # ocf_exit_reason() { local cookie="$OCF_EXIT_REASON_PREFIX" local fmt local msg # No argument is likely not intentional. # Just one argument implies a printf format string of just "%s". # "Least surprise" in case some interpolated string from variable # expansion or other contains a percent sign. # More than one argument: first argument is going to be the format string. case $# in 0) ocf_log err "Not enough arguments to ocf_log_exit_msg." 
;; 1) fmt="%s" ;; *) fmt=$1 shift case $fmt in *%*) : ;; # ok, does look like a format string *) ocf_log warn "Does not look like format string: [$fmt]" ;; esac ;; esac if [ -z "$cookie" ]; then # use a default prefix cookie="ocf-exit-reason:" fi msg=$(printf "${fmt}" "$@") printf >&2 "%s%s\n" "$cookie" "$msg" __ha_log --ignore-stderr "ERROR: $msg" } # # ocf_deprecated: Log a deprecation warning # Usage: ocf_deprecated [param-name] # Arguments: param-name optional, name of a boolean resource # parameter that can be used to suppress # the warning (default # "ignore_deprecation") ocf_deprecated() { local param param=${1:-ignore_deprecation} # don't use ${!param} here, it's a bashism if ! ocf_is_true $(eval echo \$OCF_RESKEY_$param); then ocf_log warn "This resource agent is deprecated" \ "and may be removed in a future release." \ "See the man page for details." \ "To suppress this warning, set the \"${param}\"" \ "resource parameter to true." fi } # # Ocf_run: Run a script, and log its output. # Usage: ocf_run [-q] [-info|-warn|-err] # -q: don't log the output of the command if it succeeds # -info|-warn|-err: log the output of the command at given # severity if it fails (defaults to err) # ocf_run() { local rc local output local verbose=1 local loglevel=err local var for var in 1 2 do case "$1" in "-q") verbose="" shift 1;; "-info"|"-warn"|"-err") loglevel=`echo $1 | sed -e s/-//g` shift 1;; *) ;; esac done output=`"$@" 2>&1` rc=$? output=`echo $output` if [ $rc -eq 0 ]; then if [ "$verbose" -a ! -z "$output" ]; then ocf_log info "$output" fi return $OCF_SUCCESS else if [ ! -z "$output" ]; then ocf_log $loglevel "$output" else ocf_log $loglevel "command failed: $*" fi return $rc fi } ocf_pidfile_status() { local pid pidfile=$1 if [ ! -e $pidfile ]; then # Not exists return 2 fi pid=`cat $pidfile` kill -0 $pid 2>&1 > /dev/null if [ $? 
= 0 ]; then return 0 fi # Stale return 1 } ocf_take_lock() { local lockfile=$1 local rnd=$(ocf_maybe_random) sleep 0.$rnd while ocf_pidfile_status $lockfile do ocf_log info "Sleeping until $lockfile is released..." sleep 0.$rnd done echo $$ > $lockfile } ocf_release_lock_on_exit() { local lockfile=$1 trap "rm -f $lockfile" EXIT } # returns true if the CRM is currently running a probe. A probe is # defined as a monitor operation with a monitoring interval of zero. ocf_is_probe() { [ "$__OCF_ACTION" = "monitor" -a "$OCF_RESKEY_CRM_meta_interval" = 0 ] } # returns true if the resource is configured as a clone. This is # defined as a resource where the clone-max meta attribute is present, # and set to greater than zero. ocf_is_clone() { [ ! -z "${OCF_RESKEY_CRM_meta_clone_max}" ] && [ "${OCF_RESKEY_CRM_meta_clone_max}" -gt 0 ] } # returns true if the resource is configured as a multistate # (master/slave) resource. This is defined as a resource where the # master-max meta attribute is present, and set to greater than zero. ocf_is_ms() { [ ! -z "${OCF_RESKEY_CRM_meta_master_max}" ] && [ "${OCF_RESKEY_CRM_meta_master_max}" -gt 0 ] } # version check functions # allow . 
and - to delimit version numbers # max version number is 999 # letters and such are effectively ignored # ocf_is_ver() { echo $1 | grep '^[0-9][0-9.-]*[0-9]$' >/dev/null 2>&1 } ocf_ver2num() { echo $1 | awk -F'[.-]' ' {for(i=1; i<=NF; i++) s=s*1000+$i; print s} ' } ocf_ver_level(){ echo $1 | awk -F'[.-]' '{print NF}' } ocf_ver_complete_level(){ local ver="$1" local level="$2" local i=0 while [ $i -lt $level ]; do ver=${ver}.0 i=`expr $i + 1` done echo $ver } # usage: ocf_version_cmp VER1 VER2 # version strings can contain digits, dots, and dashes # must start and end with a digit # returns: # 0: VER1 smaller (older) than VER2 # 1: versions equal # 2: VER1 greater (newer) than VER2 # 3: bad format ocf_version_cmp() { ocf_is_ver "$1" || return 3 ocf_is_ver "$2" || return 3 local v1=$1 local v2=$2 local v1_level=`ocf_ver_level $v1` local v2_level=`ocf_ver_level $v2` local level_diff if [ $v1_level -lt $v2_level ]; then level_diff=`expr $v2_level - $v1_level` v1=`ocf_ver_complete_level $v1 $level_diff` elif [ $v1_level -gt $v2_level ]; then level_diff=`expr $v1_level - $v2_level` v2=`ocf_ver_complete_level $v2 $level_diff` fi v1=`ocf_ver2num $v1` v2=`ocf_ver2num $v2` if [ $v1 -eq $v2 ]; then return 1 elif [ $v1 -lt $v2 ]; then return 0 else return 2 # -1 would look funny in shell ;-) fi } ocf_local_nodename() { # use crm_node -n for pacemaker > 1.1.8 which pacemakerd > /dev/null 2>&1 if [ $? -eq 0 ]; then local version=$(pacemakerd -$ | grep "Pacemaker .*" | awk '{ print $2 }') version=$(echo $version | awk -F- '{ print $1 }') ocf_version_cmp "$version" "1.1.8" if [ $? -eq 2 ]; then which crm_node > /dev/null 2>&1 if [ $? -eq 0 ]; then crm_node -n return fi fi fi # otherwise use uname -n uname -n } # usage: dirname DIR dirname() { local a local b [ $# = 1 ] || return 1 a="$1" while [ 1 ]; do b="${a%/}" [ "$a" = "$b" ] && break a="$b" done b=${a%/*} [ -z "$b" -o "$a" = "$b" ] && b="." echo "$b" return 0 } # # pseudo_resource status tracking function... 
# # This allows pseudo resources to give correct status information. As we add # resource monitoring, and better resource tracking in general, this will # become essential. # -# These scripts work because ${HA_RSCTMP} is cleaned out every time -# heartbeat is started. +# These scripts work because ${HA_RSCTMP} is cleaned on node reboot. # # We create "resource-string" tracking files under ${HA_RSCTMP} in a # very simple way: # # Existence of "${HA_RSCTMP}/resource-string" means that we consider # the resource named by "resource-string" to be running. # # Note that "resource-string" needs to be unique. Using the resource type # plus the resource instance arguments to make up the resource string # is probably sufficient... # # usage: ha_pseudo_resource resource-string op [tracking_file] # where op is {start|stop|monitor|status|restart|reload|print} # print is a special op which just prints the tracking file location # user can override our choice of the tracking file location by # specifying it as the third arg # Note that all operations are silent... 
# ha_pseudo_resource() { local ha_resource_tracking_file="${3:-${HA_RSCTMP}/$1}" case $2 in start|restart|reload) touch "$ha_resource_tracking_file";; stop) rm -f "$ha_resource_tracking_file";; status|monitor) if [ -f "$ha_resource_tracking_file" ] then return 0 else case $2 in status) return 3;; *) return 7;; esac fi;; print) echo "$ha_resource_tracking_file";; *) return 3;; esac } # usage: rmtempdir TMPDIR rmtempdir() { [ $# = 1 ] || return 1 if [ -e "$1" ]; then rmdir "$1" || return 1 fi return 0 } # usage: maketempfile [-d] maketempfile() { if [ $# = 1 -a "$1" = "-d" ]; then mktemp -d return -0 elif [ $# != 0 ]; then return 1 fi mktemp return 0 } # usage: rmtempfile TMPFILE rmtempfile () { [ $# = 1 ] || return 1 if [ -e "$1" ]; then rm "$1" || return 1 fi return 0 } # echo the first lower supported check level # pass set of levels supported by the agent # (in increasing order, 0 is optional) ocf_check_level() { local lvl prev lvl=0 prev=0 if ocf_is_decimal "$OCF_CHECK_LEVEL"; then # the level list should be very short for lvl; do if [ "$lvl" -eq "$OCF_CHECK_LEVEL" ]; then break elif [ "$lvl" -gt "$OCF_CHECK_LEVEL" ]; then lvl=$prev # the previous one break fi prev=$lvl done fi echo $lvl } # usage: ocf_stop_processes SIGNALS WAIT_TIME PIDS # # we send signals (use quotes for more than one!) 
in the order # given; if one or more processes are still running we try KILL; # the wait_time is the _total_ time we'll spend in this function # this time may be slightly exceeded if the processes won't leave # # returns: # 0: all processes left # 1: some processes still running # # example: # # ocf_stop_processes TERM 5 $pids # ocf_stop_processes() { local signals="$1" local wait_time="$(($2/`echo $signals|wc -w`))" shift 2 local pids="$*" local sig i test -z "$pids" && return 0 for sig in $signals KILL; do kill -s $sig $pids 2>/dev/null # try to leave early, and yet leave processes time to exit sleep 0.2 for i in `seq $wait_time`; do kill -s 0 $pids 2>/dev/null || return 0 sleep 1 done done return 1 } # # create a given status directory # if the directory path doesn't start with $HA_VARRUN, then # we return with error (most of the calls would be with the user # supplied configuration, hence we need to do necessary # protection) # used mostly for PID files # # usage: ocf_mkstatedir owner permissions path # # owner: user.group # permissions: permissions # path: directory path # # example: # ocf_mkstatedir named 755 `dirname $pidfile` # ocf_mkstatedir() { local owner local perms local path owner=$1 perms=$2 path=$3 test -d $path && return 0 [ $(id -u) = 0 ] || return 1 case $path in $HA_VARRUN/*) : this path is ok ;; *) ocf_log err "cannot create $path (does not start with $HA_VARRUN)" return 1 ;; esac mkdir -p $path && chown $owner $path && chmod $perms $path } # # create a unique status directory in $HA_VARRUN # used mostly for PID files # the directory is by default set to # $HA_VARRUN/$OCF_RESOURCE_INSTANCE # the directory name is printed to stdout # # usage: ocf_unique_rundir owner permissions name # # owner: user.group (default: "root") # permissions: permissions (default: "755") # name: some unique string (default: "$OCF_RESOURCE_INSTANCE") # # to use the default either don't set the parameter or set it to # empty string ("") # example: # # 
STATEDIR=`ocf_unique_rundir named "" myownstatedir` # ocf_unique_rundir() { local path local owner local perms local name owner=${1:-"root"} perms=${2:-"755"} name=${3:-"$OCF_RESOURCE_INSTANCE"} path=$HA_VARRUN/$name if [ ! -d $path ]; then [ $(id -u) = 0 ] || return 1 mkdir -p $path && chown $owner $path && chmod $perms $path || return 1 fi echo $path } # # RA tracing may be turned on by setting OCF_TRACE_RA # the trace output will be saved to OCF_TRACE_FILE, if set, or # by default to # $HA_VARLIB/trace_ra//.. # e.g. $HA_VARLIB/trace_ra/oracle/db.start.2012-11-27.08:37:08 # # OCF_TRACE_FILE: # - FD (small integer [3-9]) in that case it is up to the callers # to capture output; the FD _must_ be open for writing # - absolute path # # NB: FD 9 may be used for tracing with bash >= v4 in case # OCF_TRACE_FILE is set to a path. # ocf_is_bash4() { echo "$SHELL" | grep bash > /dev/null && [ ${BASH_VERSINFO[0]} = "4" ] } ocf_trace_redirect_to_file() { local dest=$1 if ocf_is_bash4; then exec 9>$dest BASH_XTRACEFD=9 else exec 2>$dest fi } ocf_trace_redirect_to_fd() { local fd=$1 if ocf_is_bash4; then BASH_XTRACEFD=$fd else exec 2>&$fd fi } __ocf_test_trc_dest() { local dest=$1 if ! 
touch $dest; then ocf_log warn "$dest not writable, trace not going to happen" __OCF_TRC_DEST="" __OCF_TRC_MANAGE="" return 1 fi return 0 } ocf_default_trace_dest() { tty >/dev/null && return if [ -n "$OCF_RESOURCE_TYPE" -a \ -n "$OCF_RESOURCE_INSTANCE" -a -n "$__OCF_ACTION" ]; then local ts=`date +%F.%T` __OCF_TRC_DEST=$HA_VARLIB/trace_ra/${OCF_RESOURCE_TYPE}/${OCF_RESOURCE_INSTANCE}.${__OCF_ACTION}.$ts __OCF_TRC_MANAGE="1" fi } ocf_start_trace() { export __OCF_TRC_DEST="" __OCF_TRC_MANAGE="" case "$OCF_TRACE_FILE" in [3-9]) ocf_trace_redirect_to_fd "$OCF_TRACE_FILE" ;; /*/*) __OCF_TRC_DEST=$OCF_TRACE_FILE ;; "") ocf_default_trace_dest ;; *) ocf_log warn "OCF_TRACE_FILE must be set to either FD (open for writing) or absolute file path" ocf_default_trace_dest ;; esac if [ "$__OCF_TRC_DEST" ]; then mkdir -p `dirname $__OCF_TRC_DEST` __ocf_test_trc_dest $__OCF_TRC_DEST || return ocf_trace_redirect_to_file "$__OCF_TRC_DEST" fi if [ -n "$BASH_VERSION" ]; then PS4='+ `date +"%T"`: ${FUNCNAME[0]:+${FUNCNAME[0]}:}${LINENO}: ' fi set -x env=$( echo; printenv | sort ) } ocf_stop_trace() { set +x } __ocf_set_defaults "$@" : ${OCF_TRACE_RA:=$OCF_RESKEY_trace_ra} ocf_is_true "$OCF_TRACE_RA" && ocf_start_trace # pacemaker sets HA_use_logd, some others use HA_LOGD :/ if ocf_is_true "$HA_use_logd"; then : ${HA_LOGD:=yes} fi diff --git a/heartbeat/oracle b/heartbeat/oracle index 951221c5e..6fad5bc6f 100755 --- a/heartbeat/oracle +++ b/heartbeat/oracle @@ -1,763 +1,770 @@ #!/bin/sh # # # oracle # # Description: Manages an Oracle Database as a High-Availability # resource # # # Author: Dejan Muhamedagic # Support: linux-ha@lists.linux-ha.org # License: GNU General Public License (GPL) # Copyright: (C) 2006 International Business Machines, Inc. # # This code inspired by the DB2 resource script # written by Alan Robertson # # An example usage in /etc/ha.d/haresources: # node1 10.0.0.170 oracle::RK1::/oracle/10.2::orark1 # # See oracle_usage() function below for more details... 
# # OCF instance parameters: # OCF_RESKEY_sid # OCF_RESKEY_home (optional; else read it from /etc/oratab) # OCF_RESKEY_user (optional; figure it out by checking file ownership) # OCF_RESKEY_ipcrm (optional; defaults to "instance") # OCF_RESKEY_clear_backupmode (optional; default to "false") # OCF_RESKEY_shutdown_method (optional; default to "checkpoint/abort") # OCF_RESKEY_monuser (optional; defaults to "OCFMON") # OCF_RESKEY_monpassword (optional; defaults to "OCFMON") # OCF_RESKEY_monprofile (optional; defaults to "OCFMONPROFILE") # # Initialization: : ${OCF_FUNCTIONS_DIR=${OCF_ROOT}/lib/heartbeat} . ${OCF_FUNCTIONS_DIR}/ocf-shellfuncs . ${OCF_FUNCTIONS_DIR}/ora-common.sh ####################################################################### oracle_usage() { methods=`oracle_methods` methods=`echo $methods | tr ' ' '|'` cat <<-! usage: $0 {$methods} $0 manages an Oracle Database instance as an HA resource. The 'start' operation starts the database. The 'stop' operation stops the database. The 'status' operation reports whether the database is running The 'monitor' operation reports whether the database seems to be working The 'dumpinstipc' operation prints IPC resources used by the instance The 'cleanup' operation tries to clean up after Oracle was brutally stopped The 'validate-all' operation reports whether the parameters are valid The 'methods' operation reports on the methods $0 supports ! } # Defaults OCF_RESKEY_monuser_default="OCFMON" OCF_RESKEY_monpassword_default="OCFMON" OCF_RESKEY_monprofile_default="OCFMONPROFILE" oracle_meta_data() { cat < 1.0 Resource script for oracle. Manages an Oracle Database instance as an HA resource. Manages an Oracle Database instance The Oracle SID (aka ORACLE_SID). sid The Oracle home directory (aka ORACLE_HOME). If not specified, then the SID along with its home should be listed in /etc/oratab. home The Oracle owner (aka ORACLE_OWNER). 
If not specified, then it is set to the owner of file \$ORACLE_HOME/dbs/*\${ORACLE_SID}.ora. If this does not work for you, just set it explicitely. user Monitoring user name. Every connection as sysdba is logged in an audit log. This can result in a large number of new files created. A new user is created (if it doesn't exist) in the start action and subsequently used in monitor. It should have very limited rights. Make sure that the password for this user does not expire. monuser Password for the monitoring user. Make sure that the password for this user does not expire. monpassword Profile used by the monitoring user. If the profile does not exist, it will be created with a non-expiring password. monprofile Sometimes IPC objects (shared memory segments and semaphores) belonging to an Oracle instance might be left behind which prevents the instance from starting. It is not easy to figure out which shared segments belong to which instance, in particular when more instances are running as same user. What we use here is the "oradebug" feature and its "ipc" trace utility. It is not optimal to parse the debugging information, but I am not aware of any other way to find out about the IPC information. In case the format or wording of the trace report changes, parsing might fail. There are some precautions, however, to prevent stepping on other peoples toes. There is also a dumpinstipc option which will make us print the IPC objects which belong to the instance. Use it to see if we parse the trace file correctly. 
Three settings are possible: - none: don't mess with IPC and hope for the best (beware: you'll probably be out of luck, sooner or later) - instance: try to figure out the IPC stuff which belongs to the instance and remove only those (default; should be safe) - orauser: remove all IPC belonging to the user which runs the instance (don't use this if you run more than one instance as same user or if other apps running as this user use IPC) The default setting "instance" should be safe to use, but in that case we cannot guarantee that the instance will start. In case IPC objects were already left around, because, for instance, someone mercilessly killing Oracle processes, there is no way any more to find out which IPC objects should be removed. In that case, human intervention is necessary, and probably _all_ instances running as same user will have to be stopped. The third setting, "orauser", guarantees IPC objects removal, but it does that based only on IPC objects ownership, so you should use that only if every instance runs as separate user. Please report any problems. Suggestions/fixes welcome. ipcrm The clear of the backup mode of ORACLE. clear_backupmode How to stop Oracle is a matter of taste it seems. The default method ("checkpoint/abort") is: alter system checkpoint; shutdown abort; This should be the fastest safe way bring the instance down. If you find "shutdown abort" distasteful, set this attribute to "immediate" in which case we will shutdown immediate; If you still think that there's even better way to shutdown an Oracle instance we are willing to listen. shutdown_method END } # # methods: What methods/operations do we support? # oracle_methods() { cat <<-! start stop status monitor dumpinstipc showdbstat cleanup validate-all methods meta-data usage ! } # # Run commands as the Oracle owner... # execsql() { if [ "$US" = "$ORACLE_OWNER" ]; then sqlplus -S /nolog else su - $ORACLE_OWNER -s /bin/sh -c ". 
$ORA_ENVF; sqlplus -S /nolog" fi } # # Run commands in the oracle admin sqlplus... # common_sql_opts() { cat</dev/null; then return 0 fi output=`dbasql mk_mon_profile show_mon_profile` if echo "$output" | grep -iw "^$MONPROFILE" >/dev/null; then return 0 + elif echo "$output" | grep ORA-65140 >/dev/null 2>&1; then + ocf_exit_reason "monprofile must start with C## for container databases" + return $OCF_ERR_CONFIGURED else ocf_exit_reason "could not create $MONPROFILE oracle profile" ocf_log err "sqlplus output: $output" return 1 fi } check_mon_user() { local output local output2 output=`dbasql show_mon_user` if echo "$output" | grep -iw "^$MONUSR" >/dev/null; then if echo "$output" | grep -w "EXPIRED" >/dev/null; then dbasql reset_mon_user_password fi output=`dbasql show_mon_user_profile` if echo "$output" | grep -iw "^$MONPROFILE" >/dev/null; then return 0 else output=`dbasql set_mon_user_profile` output2=`dbasql show_mon_user_profile` if echo "$output2" | grep -iw "^$MONPROFILE" >/dev/null; then return 0 fi ocf_exit_reason "could not set profile for $MONUSR oracle user" ocf_log err "sqlplus output: $output( $output2 )" return 1 fi fi output=`dbasql mk_mon_user show_mon_user` if echo "$output" | grep -iw "^$MONUSR" >/dev/null; then return 0 + elif echo "$output" | grep ORA-65096 >/dev/null 2>&1; then + ocf_exit_reason "monuser must start with C## for container databases" + return $OCF_ERR_CONFIGURED else ocf_exit_reason "could not create $MONUSR oracle user" ocf_log err "sqlplus output: $output" return 1 fi } # # print the output of dbstat (for debugging) # showdbstat() { echo "Full output:" dbstat | execsql echo "Stripped output:" echo "<`dbasql dbstat`>" } # # IPC stuff: not overly complex, but quite involved :-/ # # Part 1: Oracle other_trace_junk() { echo $1 | sed 's/trc$/trm/' } dumpinstipc() { local output tracef output=`dbasql getipc` # filename in the 2nd line tracef=`echo "$output" | awk 'NR==2' | grep '^/.*trc$'` if [ "$tracef" ]; then echo $tracef else 
ocf_log warn "'dbasql getipc' failed: $output" return 1 fi } parseipc() { local inf=$1 if [ ! -f "$1" ]; then ocf_log warn "$1: no such ipc trace file" return 1 fi awk ' $3 == "Shmid" {n=1;next} n { if( $3~/^[0-9]+$/ ) print $3; n=0 } ' $inf | sort -u | sed 's/^/m:/' awk ' /Semaphore List/ {insems=1;next} insems { for( i=1; i<=NF; i++ ) if( $i~/^[0-9]+$/ ) print $i; } /system semaphore information/ {exit} ' $inf | sort -u | sed 's/^/s:/' TMPFILES="$TMPFILES $inf `other_trace_junk $inf`" } # Part 2: OS (ipcs,ipcrm) filteroraipc() { # this portable? grep -w $ORACLE_OWNER | awk '{print $2}' } ipcdesc() { local what=$1 case $what in m) echo "shared memory segment";; s) echo "semaphore";; q) echo "message queue";; esac } rmipc() { local what=$1 id=$2 ipcs -$what | filteroraipc | grep -iw $id >/dev/null 2>&1 || return ocf_log info "Removing `ipcdesc $what` $id." ipcrm -$what $id } ipcrm_orauser() { local what id for what in m s q; do for id in `ipcs -$what | filteroraipc`; do rmipc $what $id done done } ipcrm_instance() { local ipcobj for ipcobj; do rmipc `echo $ipcobj | sed 's/:/ /'` done } # # oracle_status: is the Oracle instance running? # # quick check to see if the instance is up is_proc_running() { ps -ef | grep -wiqs "[^ ]*[_]pmon_${ORACLE_SID}" } # instance in OPEN state? instance_live() { local status=`monsql_one dbstat` [ "$status" = OPEN ] && return 0 status=`dbasql_one dbstat` if [ "$status" = OPEN ]; then return 0 else ocf_log info "$ORACLE_SID instance state is not OPEN (dbstat output: $status)" return 1 fi } ora_cleanup() { #rm -fr /tmp/.oracle #??? 
rm -f `ls $ORACLE_HOME/dbs/lk* | grep -i "$ORACLE_SID\$"` #return case $IPCRM in none) ;; instance) ipcrm_instance $* ;; orauser) ipcrm_orauser $* ;; esac } oracle_getconfig() { ora_common_getconfig "$OCF_RESKEY_sid" "$OCF_RESKEY_home" "$OCF_RESKEY_user" clear_backupmode=${OCF_RESKEY_clear_backupmode:-"false"} shutdown_method=${OCF_RESKEY_shutdown_method:-"checkpoint/abort"} IPCRM=${OCF_RESKEY_ipcrm:-"instance"} } # # oracle_start: Start the Oracle instance # # NOTE: We handle instance in the MOUNTED and STARTED states # efficiently # We *do not* handle instance in the restricted or read-only # mode, i.e. it appears as running, but its availability is # "not for general use" # oracle_start() { local status output if is_proc_running; then status="`dbasql_one dbstat`" case "$status" in "OPEN") : nothing to be done, we can leave right now ocf_log info "Oracle instance $ORACLE_SID already running" return $OCF_SUCCESS ;; "STARTED") output=`dbasql dbmount` ;; "MOUNTED") : we proceed if mounted ;; *) # status unknown output=`dbasql dbstop dbstart_mount` ;; esac else output="`dbasql dbstart_mount`" # try to cleanup in case of # ORA-01081: cannot start already-running ORACLE - shut it down first if echo "$output" | grep ORA-01081 >/dev/null 2>&1; then ocf_log info "ORA-01081 error found, trying to cleanup oracle (dbstart_mount output: $output)" ora_cleanup + output=`dbasql dbstop_immediate` output=`dbasql dbstart_mount` fi fi # oracle instance should be mounted. status="`dbasql_one dbstat`" case "$status" in "MOUNTED") ;; *) : error!! ocf_exit_reason "oracle $ORACLE_SID can not be mounted (status: $status)" return $OCF_ERR_GENERIC ;; esac # It is examined whether mode is "online backup mode", # and if it is true, makes clear the mode. # Afterwards, DB is opened. if is_clear_backupmode_set && is_instance_in_backup_mode; then clear_backup_mode fi output=`dbasql dbopen` # check/create the monitor profile if ! 
check_mon_profile; then return $OCF_ERR_GENERIC fi # check/create the monitor user if ! check_mon_user; then return $OCF_ERR_GENERIC fi if ! is_proc_running; then ocf_exit_reason "oracle process not running: $output" return $OCF_ERR_GENERIC elif ! instance_live; then ocf_exit_reason "oracle instance $ORACLE_SID not started: $output" return $OCF_ERR_GENERIC else : cool, we are up and running ocf_log info "Oracle instance $ORACLE_SID started: $output" return $OCF_SUCCESS fi } # # oracle_stop: Stop the Oracle instance # oracle_stop() { local status output ipc="" if is_proc_running; then [ "$IPCRM" = "instance" ] && ipc=$(parseipc `dumpinstipc`) output=`dbasql dbstop` else ocf_log info "Oracle instance $ORACLE_SID already stopped" return $OCF_SUCCESS fi ocf_stop_processes TERM $PROCS_CLEANUP_TIME `proc_pids` # kill the procs if they hanged if is_proc_running; then ocf_exit_reason "Oracle instance $ORACLE_SID not stopped: $output" return $OCF_ERR_GENERIC else ocf_log info "Oracle instance $ORACLE_SID stopped: $output" sleep 1 # give em a chance to cleanup ocf_log info "Cleaning up for $ORACLE_SID" ora_cleanup "$ipc" return $OCF_SUCCESS fi } # # oracle_monitor: Can the Oracle instance do anything useful? # oracle_monitor() { if ! is_proc_running; then ocf_log info "oracle process not running" return $OCF_NOT_RUNNING fi if ! 
instance_live; then ocf_exit_reason "oracle instance $ORACLE_SID is down" return $OCF_ERR_GENERIC fi #ocf_log info "Oracle instance $ORACLE_SID is alive" return $OCF_SUCCESS } # other supported actions oracle_status() { if is_proc_running then echo Oracle instance $ORACLE_SID is running exit $OCF_SUCCESS else echo Oracle instance $ORACLE_SID is stopped exit $OCF_NOT_RUNNING fi } oracle_dumpinstipc() { is_proc_running && parseipc `dumpinstipc` } oracle_showdbstat() { showdbstat } oracle_cleanup() { if [ "$IPCRM" = "instance" ]; then ora_cleanup $(parseipc `dumpinstipc`) else ora_cleanup fi } oracle_validate_all() { case "${shutdown_method}" in "immediate") ;; "checkpoint/abort") ;; *) ocf_exit_reason "unsupported shutdown_method, please read meta-data" return $OCF_ERR_CONFIGURED ;; esac case "${IPCRM}" in "none"|"instance"|"orauser") ;; *) ocf_exit_reason "unsupported ipcrm setting, please read meta-data" return $OCF_ERR_CONFIGURED ;; esac ora_common_validate_all } # used in ora-common.sh show_procs() { ps -e -o pid,args | grep -i "[o]ra[a-zA-Z0-9_]*$ORACLE_SID$" } proc_pids() { show_procs | awk '{print $1}'; } PROCS_CLEANUP_TIME="30" MONUSR=${OCF_RESKEY_monuser:-$OCF_RESKEY_monuser_default} MONPWD=${OCF_RESKEY_monpassword:-$OCF_RESKEY_monpassword_default} -MONPROFILE=${OCF_RESKEY_monprofile_default:-$OCF_RESKEY_monprofile_default} +MONPROFILE=${OCF_RESKEY_monprofile:-$OCF_RESKEY_monprofile_default} -MONUSR=$(echo $MONUSR | awk '{print toupper($0)}') -MONPROFILE=$(echo $MONPROFILE | awk '{print toupper($0)}') +MONUSR=$(echo "$MONUSR" | awk '{print toupper($0)}') +MONPROFILE=$(echo "$MONPROFILE" | awk '{print toupper($0)}') OCF_REQUIRED_PARAMS="sid" OCF_REQUIRED_BINARIES="sqlplus" ocf_rarun $* # # vim:tabstop=4:shiftwidth=4:textwidth=0:wrapmargin=0 diff --git a/heartbeat/oralsnr b/heartbeat/oralsnr index c47f12117..622138c6f 100755 --- a/heartbeat/oralsnr +++ b/heartbeat/oralsnr @@ -1,281 +1,281 @@ #!/bin/sh # # # oralsnr # # Description: Manages an Oracle Listener 
as a High-Availability # resource # # # Author: Dejan Muhamedagic # Support: linux-ha@lists.linux-ha.org # License: GNU General Public License (GPL) # Copyright: (C) 2006 International Business Machines, Inc. # # This code inspired by the DB2 resource script # written by Alan Robertson # # An example usage in /etc/ha.d/haresources: # node1 10.0.0.170 oralsnr::sid::home::user::listener # # See oralsnr_usage() function below for more details... # # OCF instance parameters: # OCF_RESKEY_sid (mandatory; for the monitor op) # OCF_RESKEY_home (optional; else read it from /etc/oratab) # OCF_RESKEY_user (optional; user to run the listener) # OCF_RESKEY_listener (optional; defaults to LISTENER) # # Initialization: : ${OCF_FUNCTIONS_DIR=${OCF_ROOT}/lib/heartbeat} . ${OCF_FUNCTIONS_DIR}/ocf-shellfuncs . ${OCF_FUNCTIONS_DIR}/ora-common.sh ####################################################################### SH=/bin/sh oralsnr_usage() { methods=`oralsnr_methods` methods=`echo $methods | tr ' ' '|'` cat <<-! usage: $0 ($methods) $0 manages an Oracle Database instance as an HA resource. The 'start' operation starts the database. The 'stop' operation stops the database. The 'status' operation reports whether the database is running The 'monitor' operation reports whether the database seems to be working The 'validate-all' operation reports whether the parameters are valid The 'methods' operation reports on the methods $0 supports ! } oralsnr_meta_data() { cat < 1.0 Resource script for Oracle Listener. It manages an Oracle Listener instance as an HA resource. Manages an Oracle TNS listener The Oracle SID (aka ORACLE_SID). Necessary for the monitor op, i.e. to do tnsping SID. sid The Oracle home directory (aka ORACLE_HOME). If not specified, then the SID should be listed in /etc/oratab. home Run the listener as this user. user Listener instance to be started (as defined in listener.ora). Defaults to LISTENER. 
listener Full path to the directory that contains the Oracle listener tnsnames.ora configuration file. The shell variable TNS_ADMIN is set to the value provided. Full path to the directory containing tnsnames.ora END } # # methods: What methods/operations do we support? # oralsnr_methods() { cat <<-! start stop status monitor validate-all methods meta-data usage ! } # # Run commands as the Oracle owner... # runasdba() { if [ "$US" = "$ORACLE_OWNER" ]; then $SH else ( echo ". $ORA_ENVF" cat ) | su -s $SH - $ORACLE_OWNER fi } # # oralsnr_start: Start the Oracle listener instance # oralsnr_start() { if is_proc_running && test_tnsping; then : nothing to be done, we can leave right now ocf_log info "Listener $listener already running" return $OCF_SUCCESS fi output=`echo lsnrctl start $listener | runasdba` if test_tnsping; then : cool, we are up and running ocf_log info "Listener $listener running: $output" return $OCF_SUCCESS else ocf_exit_reason "Listener $listener appears to have started, but is not running properly: $output" ocf_log err "Probable Oracle configuration error" return $OCF_ERR_GENERIC fi } # # oralsnr_stop: Stop the Oracle instance # oralsnr_stop() { if is_proc_running; then output=`echo lsnrctl stop $listener | runasdba` else ocf_log info "Listener $listener already stopped" return $OCF_SUCCESS fi ocf_stop_processes TERM $PROCS_CLEANUP_TIME `proc_pids` # kill the procs if they hanged if is_proc_running; then ocf_exit_reason "Listener $listener not stopped: $output" return $OCF_ERR_GENERIC else ocf_log info "Listener $listener stopped: $output" return $OCF_SUCCESS fi } # # is_proc_running: is the listener running? # is_proc_running() { show_procs | grep "." 
> /dev/null } # the following two should be run only if the process is running test_listener() { local output output=`lsnrctl status $listener` if echo "$output" | tail -1 | grep -qs 'completed successfully' then return $OCF_SUCCESS else ocf_exit_reason "$listener status failed: $output" return $OCF_ERR_GENERIC fi } # and does it work? test_tnsping() { local output output=`tnsping $ORACLE_SID` if echo "$output" | tail -1 | grep -qs '^OK'; then return $OCF_SUCCESS else ocf_exit_reason "tnsping $ORACLE_SID failed: $output" return $OCF_ERR_GENERIC fi } # # oralsnr_monitor: Can we connect to the listener? # oralsnr_monitor() { if is_proc_running; then test_listener && test_tnsping else return $OCF_NOT_RUNNING fi } oralsnr_status() { if is_proc_running then echo Listener $listener is running exit $OCF_SUCCESS else echo Listener $listener is stopped exit $OCF_NOT_RUNNING fi } oralsnr_getconfig() { ora_common_getconfig "$OCF_RESKEY_sid" "$OCF_RESKEY_home" "$OCF_RESKEY_user" "$OCF_RESKEY_tns_admin" listener=${OCF_RESKEY_listener:-"LISTENER"} } oralsnr_validate_all() { ora_common_validate_all } # used in ora-common.sh show_procs() { - ps -e -o pid,user,args | - grep '[t]nslsnr' | grep -i -w "$listener" | grep -w "$ORACLE_OWNER" + ps -U "$ORACLE_OWNER" -o pid,user,args | + grep '[t]nslsnr' | grep -i -w "$listener" } proc_pids() { show_procs | awk '{print $1}'; } PROCS_CLEANUP_TIME="10" OCF_REQUIRED_PARAMS="sid" OCF_REQUIRED_BINARIES="lsnrctl tnsping" ocf_rarun $* # # vim:tabstop=4:shiftwidth=4:textwidth=0:wrapmargin=0 diff --git a/heartbeat/pgagent b/heartbeat/pgagent new file mode 100644 index 000000000..58054a7c3 --- /dev/null +++ b/heartbeat/pgagent @@ -0,0 +1,139 @@ +#!/bin/sh +# +# High-Availability pgagent OCF resource agent +# +# Description: Starts/stops pgagent +# Author: Oleg Selin +# License: GNU General Public License (GPL) +# +# OCF parameters: +# OCF_RESKEY_connection_string +# OCF_RESKEY_user +# OCF_RESKEY_options +# 
+####################################################################### +# Initialization: +: ${OCF_FUNCTIONS_DIR=${OCF_ROOT}/lib/heartbeat} +. ${OCF_FUNCTIONS_DIR}/ocf-shellfuncs + +OCF_RESKEY_executable_default="`which pgagent`" +OCF_RESKEY_connection_string_default="user=postgres host=/var/run/postgresql" +OCF_RESKEY_user_default="postgres" +OCF_RESKEY_options_default="-r 1 -t 1" + +: ${OCF_RESKEY_executable="${OCF_RESKEY_executable_default}"} +: ${OCF_RESKEY_connection_string="${OCF_RESKEY_connection_string_default}"} +: ${OCF_RESKEY_user="${OCF_RESKEY_user_default}"} +: ${OCF_RESKEY_options="${OCF_RESKEY_options_default}"} + +pgagent_validate_all() { + check_binary pgagent + ocf_log debug "executable: '$OCF_RESKEY_executable'" + ocf_log debug "connection string: '$OCF_RESKEY_connection_string'" + ocf_log debug "user: '$OCF_RESKEY_user'" + ocf_log debug "options: '$OCF_RESKEY_options'" + if [ -z "$OCF_RESKEY_connection_string" ]; then + ocf_log err "Connection string is not configured!" + exit $OCF_ERR_CONFIGURED + fi + if [ -z "$OCF_RESKEY_user" ]; then + ocf_log err "User is not configured!" + exit $OCF_ERR_CONFIGURED + fi + getent passwd $OCF_RESKEY_user >/dev/null 2>&1 + if [ ! $? 
-eq 0 ]; then + ocf_log err "User $OCF_RESKEY_user doesn't exist"; + return $OCF_ERR_CONFIGURED; + fi + return $OCF_SUCCESS +} + +pgagent_start() { + pgagent_validate_all + nohup su - $OCF_RESKEY_user -c "'$OCF_RESKEY_executable' $OCF_RESKEY_options '$OCF_RESKEY_connection_string'" > /dev/null 2>&1 & + sleep 1 + if [ -n pgagent_monitor ]; then + return $OCF_SUCCESS + fi + return $OCF_ERR_GENERIC +} + +pgagent_stop() { + pgagent_validate_all + pid=`pgrep -f -x -U $OCF_RESKEY_user "$OCF_RESKEY_executable $OCF_RESKEY_options $OCF_RESKEY_connection_string"` + if [ -n "$pid" ]; then + ocf_run kill $pid || return $OCF_ERR_GENERIC + fi + return $OCF_SUCCESS +} + +pgagent_monitor() { + if [ -z "$OCF_RESKEY_executable" ]; then + return $OCF_ERR_INSTALLED + fi + ocf_run pgrep -f -x -U "$OCF_RESKEY_user" "$OCF_RESKEY_executable $OCF_RESKEY_options $OCF_RESKEY_connection_string" || return $OCF_NOT_RUNNING + return $OCF_SUCCESS +} + +meta_data() { + cat < + + +1.0 +This is a pgagent Resource Agent. +Controls pgagent + + +Connection string for pgagent. +pgagent connection string + + + +User to run pgagent as. +User to run pgagent + + + +Options for pgagent. +pgagent run options, see pgagent --help for details + + + + + + + + + + + +END +} + +pgagent_usage() { + cat < # and other Linux-HA contributors # License: GNU General Public License (GPL) # ############################################################################### # Initialization: : ${OCF_FUNCTIONS_DIR=${OCF_ROOT}/lib/heartbeat} . ${OCF_FUNCTIONS_DIR}/ocf-shellfuncs # Use runuser if available for SELinux. 
if [ -x /sbin/runuser ]; then SU=runuser else SU=su fi # # Get PostgreSQL Configuration parameter # get_pgsql_param() { local param_name param_name=$1 perl_code="if (/^\s*$param_name[\s=]+\s*(.*)$/) { \$dir=\$1; \$dir =~ s/\s*\#.*//; \$dir =~ s/^'(\S*)'/\$1/; print \$dir;}" perl -ne "$perl_code" < $OCF_RESKEY_config } # Defaults OCF_RESKEY_pgctl_default=/usr/bin/pg_ctl OCF_RESKEY_psql_default=/usr/bin/psql OCF_RESKEY_pgdata_default=/var/lib/pgsql/data OCF_RESKEY_pgdba_default=postgres OCF_RESKEY_pghost_default="" OCF_RESKEY_pgport_default=5432 OCF_RESKEY_pglibs_default=/usr/lib OCF_RESKEY_start_opt_default="" OCF_RESKEY_ctl_opt_default="" OCF_RESKEY_pgdb_default=template1 OCF_RESKEY_logfile_default=/dev/null -OCF_RESKEY_stop_escalate_default=30 +OCF_RESKEY_stop_escalate_default=90 OCF_RESKEY_monitor_user_default="" OCF_RESKEY_monitor_password_default="" OCF_RESKEY_monitor_sql_default="select now();" OCF_RESKEY_check_wal_receiver_default="false" # Defaults for replication OCF_RESKEY_rep_mode_default=none OCF_RESKEY_node_list_default="" OCF_RESKEY_restore_command_default="" OCF_RESKEY_archive_cleanup_command_default="" OCF_RESKEY_recovery_end_command_default="" OCF_RESKEY_master_ip_default="" OCF_RESKEY_repuser_default="postgres" OCF_RESKEY_primary_conninfo_opt_default="" OCF_RESKEY_restart_on_promote_default="false" OCF_RESKEY_tmpdir_default="/var/lib/pgsql/tmp" OCF_RESKEY_xlog_check_count_default="3" OCF_RESKEY_crm_attr_timeout_default="5" -OCF_RESKEY_stop_escalate_in_slave_default=30 +OCF_RESKEY_stop_escalate_in_slave_default=90 OCF_RESKEY_replication_slot_name_default="" : ${OCF_RESKEY_pgctl=${OCF_RESKEY_pgctl_default}} : ${OCF_RESKEY_psql=${OCF_RESKEY_psql_default}} : ${OCF_RESKEY_pgdata=${OCF_RESKEY_pgdata_default}} : ${OCF_RESKEY_pgdba=${OCF_RESKEY_pgdba_default}} : ${OCF_RESKEY_pghost=${OCF_RESKEY_pghost_default}} : ${OCF_RESKEY_pgport=${OCF_RESKEY_pgport_default}} : ${OCF_RESKEY_pglibs=${OCF_RESKEY_pglibs_default}} : 
${OCF_RESKEY_config=${OCF_RESKEY_pgdata}/postgresql.conf} : ${OCF_RESKEY_start_opt=${OCF_RESKEY_start_opt_default}} : ${OCF_RESKEY_ctl_opt=${OCF_RESKEY_ctl_opt_default}} : ${OCF_RESKEY_pgdb=${OCF_RESKEY_pgdb_default}} : ${OCF_RESKEY_logfile=${OCF_RESKEY_logfile_default}} : ${OCF_RESKEY_stop_escalate=${OCF_RESKEY_stop_escalate_default}} : ${OCF_RESKEY_monitor_user=${OCF_RESKEY_monitor_user_default}} : ${OCF_RESKEY_monitor_password=${OCF_RESKEY_monitor_password_default}} : ${OCF_RESKEY_monitor_sql=${OCF_RESKEY_monitor_sql_default}} : ${OCF_RESKEY_check_wal_receiver=${OCF_RESKEY_check_wal_receiver_default}} # for replication : ${OCF_RESKEY_rep_mode=${OCF_RESKEY_rep_mode_default}} : ${OCF_RESKEY_node_list=${OCF_RESKEY_node_list_default}} : ${OCF_RESKEY_restore_command=${OCF_RESKEY_restore_command_default}} : ${OCF_RESKEY_archive_cleanup_command=${OCF_RESKEY_archive_cleanup_command_default}} : ${OCF_RESKEY_recovery_end_command=${OCF_RESKEY_recovery_end_command_default}} : ${OCF_RESKEY_master_ip=${OCF_RESKEY_master_ip_default}} : ${OCF_RESKEY_repuser=${OCF_RESKEY_repuser_default}} : ${OCF_RESKEY_primary_conninfo_opt=${OCF_RESKEY_primary_conninfo_opt_default}} : ${OCF_RESKEY_restart_on_promote=${OCF_RESKEY_restart_on_promote_default}} : ${OCF_RESKEY_tmpdir=${OCF_RESKEY_tmpdir_default}} : ${OCF_RESKEY_xlog_check_count=${OCF_RESKEY_xlog_check_count_default}} : ${OCF_RESKEY_crm_attr_timeout=${OCF_RESKEY_crm_attr_timeout_default}} : ${OCF_RESKEY_stop_escalate_in_slave=${OCF_RESKEY_stop_escalate_in_slave_default}} : ${OCF_RESKEY_replication_slot_name=${OCF_RESKEY_replication_slot_name_default}} usage() { cat < 1.0 Resource script for PostgreSQL. It manages a PostgreSQL as an HA resource. Manages a PostgreSQL database instance Path to pg_ctl command. pgctl Start options (-o start_opt in pg_ctl). "-i -p 5432" for example. start_opt Additional pg_ctl options (-w, -W etc..). ctl_opt Path to psql command. psql Path to PostgreSQL data directory. pgdata User that owns PostgreSQL. 
pgdba Hostname/IP address where PostgreSQL is listening pghost Port where PostgreSQL is listening pgport Custom location of the Postgres libraries. If not set, the standard location will be used. pglibs PostgreSQL user that pgsql RA will user for monitor operations. If it's not set pgdba user will be used. monitor_user Password for monitor user. monitor_password SQL script that will be used for monitor operations. monitor_sql Path to the PostgreSQL configuration file for the instance. Configuration file Database that will be used for monitoring. pgdb Path to PostgreSQL server log output file. logfile Unix socket directory for PostgreSQL. If you use PostgreSQL 9.3 or higher and define unix_socket_directories in the postgresql.conf, then you must set socketdir to determine which directory is used for psql command. socketdir -Number of shutdown retries (using -m fast) before resorting to -m immediate +Number of seconds to wait for stop (using -m fast) before resorting to -m immediate stop escalation Replication mode may be set to "async" or "sync" or "slave". They require PostgreSQL 9.1 or later. Once set, "async" and "sync" require node_list, master_ip, and restore_command parameters,as well as configuring PostgreSQL for replication (in postgresql.conf and pg_hba.conf). "slave" means that RA only makes recovery.conf before starting to connect to primary which is running somewhere. It dosen't need master/slave setting. It requires master_ip restore_command parameters. rep_mode All node names. Please separate each node name with a space. This is required for replication. node list restore_command for recovery.conf. This is required for replication. restore_command archive_cleanup_command for recovery.conf. This is used for replication and is optional. archive_cleanup_command recovery_end_command for recovery.conf. This is used for replication and is optional. recovery_end_command Master's floating IP address to be connected from hot standby. 
This parameter is used for "primary_conninfo" in recovery.conf. This is required for replication. master ip User used to connect to the master server. This parameter is used for "primary_conninfo" in recovery.conf. This is required for replication. repuser primary_conninfo options of recovery.conf except host, port, user and application_name. This is optional for replication. primary_conninfo_opt If this is true, RA deletes recovery.conf and restarts PostgreSQL on promote to keep Timeline ID. It probably makes fail-over slower. It's recommended to set on-fail of promote up as fence. This is optional for replication. restart_on_promote Set this option when using replication slots. Can only use lower case letters, numbers and underscore for replication_slot_name. When the master node has 1 slave node,one replication slot would be created with the name "replication_slot_name". When the master node has 2 or more slave nodes,the replication slots would be created for each node, with the name adding the node name as postfix. For example, replication_slot_name is "sample" and 2 slaves which are "node1" and "node2" connect to their slots, the slots names are "sample_node1" and "sample_node2". If the node name contains a upper case letter, hyphen and dot, those characters will be converted to a lower case letter or an underscore. For example, Node-1.example.com to node_1_example_com. pgsql RA doesn't monitor and delete the repliation slot. When the slave node has been disconnected in failure or the like, execute one of the following manually. Otherwise it may eventually cause a disk full because the master node will continue to accumulate the unsent WAL. 1. recover and reconnect the slave node to the master node as soon as possible. 2. delete the slot on the master node by following psql command. $ select pg_drop_replication_slot('replication_slot_name'); replication_slot_name Path to temporary directory. This is optional for replication. 
tmpdir Number of checks of xlog on monitor before promote. This is optional for replication. xlog check count The timeout of crm_attribute forever update command. Default value is 5 seconds. This is optional for replication. The timeout of crm_attribute forever update command. -Number of shutdown retries (using -m fast) before resorting to -m immediate +Number of seconds to wait for stop (using -m fast) before resorting to -m immediate in slave state. This is optional for replication. stop escalation_in_slave If this is true, RA checks wal_receiver process on monitor and notifies its status using "(resource name)-receiver-status" attribute. It's useful for checking whether PostgreSQL (hot standby) connects to primary. The attribute shows status as "normal" or "normal (master)" or "ERROR". Note that if you configure PostgreSQL as master/slave resource, then wal receiver is not running in the master and the attribute shows status as "normal (master)" consistently because it is normal status. check_wal_receiver EOF } # # Run the given command in the Resource owner environment... # runasowner() { local quietrun="" local loglevel="-err" local var for var in 1 2 do case "$1" in "-q") quietrun="-q" shift 1;; "warn"|"err") loglevel="-$1" shift 1;; *) ;; esac done ocf_run $quietrun $loglevel $SU $OCF_RESKEY_pgdba -c "cd $OCF_RESKEY_pgdata; $*" } # # Shell escape # escape_string() { echo "$*" | sed -e "s/'/'\\\\''/g" } # # methods: What methods/operations do we support? # pgsql_methods() { cat </dev/null 2>&1" return $? 
fi # No PID file false } pgsql_wal_receiver_status() { local PID local receiver_parent_pids local pgsql_real_monitor_status=$1 PID=`head -n 1 $PIDFILE` receiver_parent_pids=`ps -ef | tr -s " " | grep "[w]al receiver process" | cut -d " " -f 3` if echo "$receiver_parent_pids" | grep -q -w "$PID" ; then attrd_updater -n "$PGSQL_WAL_RECEIVER_STATUS_ATTR" -v "normal" -q return 0 fi if [ $pgsql_real_monitor_status -eq "$OCF_RUNNING_MASTER" ]; then attrd_updater -n "$PGSQL_WAL_RECEIVER_STATUS_ATTR" -v "normal (master)" -q return 0 fi attrd_updater -n "$PGSQL_WAL_RECEIVER_STATUS_ATTR" -v "ERROR" -q ocf_log warn "wal receiver process is not running" return 1 } # # pgsql_real_monitor # pgsql_real_monitor() { local loglevel local rc local output # Set the log level of the error message loglevel=${1:-err} if ! pgsql_status then ocf_log info "PostgreSQL is down" return $OCF_NOT_RUNNING fi if is_replication; then #Check replication state output=`exec_sql "${CHECK_MS_SQL}"` rc=$? if [ $rc -ne 0 ]; then report_psql_error $rc $loglevel "Can't get PostgreSQL recovery status." return $OCF_ERR_GENERIC fi case "$output" in f) ocf_log debug "PostgreSQL is running as a primary." if [ "$OCF_RESKEY_monitor_sql" = "$OCF_RESKEY_monitor_sql_default" ]; then return $OCF_RUNNING_MASTER fi ;; t) ocf_log debug "PostgreSQL is running as a hot standby." return $OCF_SUCCESS;; *) ocf_exit_reason "$CHECK_MS_SQL output is $output" return $OCF_ERR_GENERIC;; esac fi OCF_RESKEY_monitor_sql=`escape_string "$OCF_RESKEY_monitor_sql"` runasowner -q $loglevel "$OCF_RESKEY_psql $psql_options \ -c '$OCF_RESKEY_monitor_sql'" rc=$? if [ $rc -ne 0 ]; then report_psql_error $rc $loglevel "PostgreSQL $OCF_RESKEY_pgdb isn't running." 
return $OCF_ERR_GENERIC fi if is_replication; then return $OCF_RUNNING_MASTER fi return $OCF_SUCCESS } pgsql_replication_monitor() { local rc rc=$1 if [ $rc -ne $OCF_SUCCESS -a $rc -ne "$OCF_RUNNING_MASTER" ]; then return $rc fi # If I am Master if [ $rc -eq $OCF_RUNNING_MASTER ]; then change_data_status "$NODENAME" "LATEST" change_pgsql_status "$NODENAME" "PRI" control_slave_status || return $OCF_ERR_GENERIC if [ "$RE_CONTROL_SLAVE" = "true" ]; then sleep 2 ocf_log info "re-controlling slave status." RE_CONTROL_SLAVE="none" control_slave_status || return $OCF_ERR_GENERIC fi return $rc fi # I can't get master node name from $OCF_RESKEY_CRM_meta_notify_master_uname on monitor, # so I will get master node name using crm_mon -n print_crm_mon | tr -d "\t" | tr -d " " | grep -q "^${RESOURCE_NAME}[(:].*[):].*Master" if [ $? -ne 0 ] ; then # If I am Slave and Master is not exist ocf_log info "Master does not exist." change_pgsql_status "$NODENAME" "HS:alone" have_master_right if [ $? -eq 0 ]; then rm -f ${XLOG_NOTE_FILE}.* fi else output=`exec_with_retry 0 $CRM_ATTR_FOREVER -N "$NODENAME" \ -n "$PGSQL_DATA_STATUS_ATTR" -G -q` if [ "$output" = "DISCONNECT" ]; then change_pgsql_status "$NODENAME" "HS:alone" fi fi return $rc } #pgsql_monitor: pgsql_real_monitor() wrapper for replication pgsql_monitor() { local rc pgsql_real_monitor rc=$? if ocf_is_true ${OCF_RESKEY_check_wal_receiver}; then pgsql_wal_receiver_status $rc fi if ! is_replication; then return $rc else pgsql_replication_monitor $rc return $? fi } # pgsql_post_demote pgsql_post_demote() { DEMOTE_NODE=`echo $OCF_RESKEY_CRM_meta_notify_demote_uname | sed "s/ /\n/g" | head -1 | tr '[A-Z]' '[a-z]'` ocf_log debug "post-demote called. Demote uname is $DEMOTE_NODE" if [ "$DEMOTE_NODE" != "$NODENAME" ]; then if ! 
echo $OCF_RESKEY_CRM_meta_notify_master_uname | tr '[A-Z]' '[a-z]' | grep $NODENAME; then show_master_baseline change_pgsql_status "$NODENAME" "HS:alone" fi fi return $OCF_SUCCESS } pgsql_pre_promote() { local master_baseline local my_master_baseline local cmp_location local number_of_nodes # If my data is newer than new master's one, I fail my resource. PROMOTE_NODE=`echo $OCF_RESKEY_CRM_meta_notify_promote_uname | \ sed "s/ /\n/g" | head -1 | tr '[A-Z]' '[a-z]'` number_of_nodes=`echo $NODE_LIST | wc -w` if [ $number_of_nodes -ge 3 -a \ "$OCF_RESKEY_rep_mode" = "sync" -a \ "$PROMOTE_NODE" != "$NODENAME" ]; then master_baseline=`$CRM_ATTR_REBOOT -N "$PROMOTE_NODE" -n \ "$PGSQL_MASTER_BASELINE" -G -q 2>/dev/null` if [ $? -eq 0 ]; then my_master_baseline=`$CRM_ATTR_REBOOT -N "$NODENAME" -n \ "$PGSQL_MASTER_BASELINE" -G -q 2>/dev/null` # get older location cmp_location=`printf "$master_baseline\n$my_master_baseline\n" |\ sort | head -1` if [ "$cmp_location" != "$my_master_baseline" ]; then ocf_exit_reason "My data is newer than new master's one. New master's location : $master_baseline" exec_with_retry 0 $CRM_FAILCOUNT -r $OCF_RESOURCE_INSTANCE -U $NODENAME -v INFINITY return $OCF_ERR_GENERIC fi fi fi return $OCF_SUCCESS } pgsql_notify() { local type="${OCF_RESKEY_CRM_meta_notify_type}" local op="${OCF_RESKEY_CRM_meta_notify_operation}" local rc if ! is_replication; then return $OCF_SUCCESS fi ocf_log debug "notify: ${type} for ${op}" case $type in pre) case $op in promote) pgsql_pre_promote return $? ;; esac ;; post) case $op in promote) delete_xlog_location PROMOTE_NODE=`echo $OCF_RESKEY_CRM_meta_notify_promote_uname | \ sed "s/ /\n/g" | head -1 | tr '[A-Z]' '[a-z]'` if [ "$PROMOTE_NODE" != "$NODENAME" ]; then delete_master_baseline fi return $OCF_SUCCESS ;; demote) pgsql_post_demote return $? 
;; start|stop) MASTER_NODE=`echo $OCF_RESKEY_CRM_meta_notify_master_uname | \ sed "s/ /\n/g" | head -1 | tr '[A-Z]' '[a-z]'` if [ "$NODENAME" = "$MASTER_NODE" ]; then control_slave_status fi return $OCF_SUCCESS ;; esac ;; esac return $OCF_SUCCESS } control_slave_status() { local rc local data_status local target local all_data_status local tmp_data_status local number_of_nodes all_data_status=`exec_sql "${CHECK_REPLICATION_STATE_SQL}"` rc=$? if [ $rc -eq 0 ]; then if [ -n "$all_data_status" ]; then all_data_status=`echo $all_data_status | sed "s/\n/ /g"` fi else report_psql_error $rc err "Can't get PostgreSQL replication status." return 1 fi number_of_nodes=`echo $NODE_LIST | wc -w` for target in $NODE_LIST; do if [ "$target" = "$NODENAME" ]; then continue fi data_status="DISCONNECT" if [ -n "$all_data_status" ]; then for tmp_data_status in $all_data_status; do if ! echo $tmp_data_status | grep -q "^${target}|"; then continue fi data_status=`echo $tmp_data_status | cut -d "|" -f 2,3` ocf_log debug "node_name and data_status is $tmp_data_status" break done fi case "$data_status" in "STREAMING|SYNC") change_data_status "$target" "$data_status" change_master_score "$target" "$CAN_PROMOTE" change_pgsql_status "$target" "HS:sync" ;; "STREAMING|ASYNC") change_data_status "$target" "$data_status" if [ "$OCF_RESKEY_rep_mode" = "sync" ]; then change_master_score "$target" "$CAN_NOT_PROMOTE" set_sync_mode "$target" else if [ $number_of_nodes -le 2 ]; then change_master_score "$target" "$CAN_PROMOTE" else # I can't determine which slave's data is newest in async mode. 
change_master_score "$target" "$CAN_NOT_PROMOTE" fi fi change_pgsql_status "$target" "HS:async" ;; "STREAMING|POTENTIAL") change_data_status "$target" "$data_status" change_master_score "$target" "$CAN_NOT_PROMOTE" change_pgsql_status "$target" "HS:potential" ;; "DISCONNECT") change_data_status "$target" "$data_status" change_master_score "$target" "$CAN_NOT_PROMOTE" if [ "$OCF_RESKEY_rep_mode" = "sync" ]; then set_async_mode "$target" fi ;; *) change_data_status "$target" "$data_status" change_master_score "$target" "$CAN_NOT_PROMOTE" if [ "$OCF_RESKEY_rep_mode" = "sync" ]; then set_async_mode "$target" fi change_pgsql_status "$target" "HS:connected" ;; esac done return 0 } have_master_right() { local old local new local output local data_status local node local mylocation local count local newestXlog local oldfile local newfile ocf_log debug "Checking if I have a master right." data_status=`$CRM_ATTR_FOREVER -N "$NODENAME" -n \ "$PGSQL_DATA_STATUS_ATTR" -G -q 2>/dev/null` if [ "$OCF_RESKEY_rep_mode" = "sync" ]; then if [ -n "$data_status" -a "$data_status" != "STREAMING|SYNC" -a \ "$data_status" != "LATEST" ]; then ocf_log warn "My data is out-of-date. status=$data_status" return 1 fi else if [ -n "$data_status" -a "$data_status" != "STREAMING|SYNC" -a \ "$data_status" != "STREAMING|ASYNC" -a \ "$data_status" != "LATEST" ]; then ocf_log warn "My data is out-of-date. status=$data_status" return 1 fi fi ocf_log info "My data status=$data_status." show_xlog_location if [ $? -ne 0 ]; then ocf_exit_reason "Failed to show my xlog location." exit $OCF_ERR_GENERIC fi old=0 for count in `seq $OCF_RESKEY_xlog_check_count`; do if [ -f ${XLOG_NOTE_FILE}.$count ]; then old=$count continue fi break done new=`expr $old + 1` # get xlog locations of all nodes for node in ${NODE_LIST}; do output=`$CRM_ATTR_REBOOT -N "$node" -n \ "$PGSQL_XLOG_LOC_NAME" -G -q 2>/dev/null` if [ $? -ne 0 ]; then ocf_log warn "Can't get $node xlog location." 
continue else ocf_log info "$node xlog location : $output" echo "$node $output" >> ${XLOG_NOTE_FILE}.${new} if [ "$node" = "$NODENAME" ]; then mylocation=$output fi fi done oldfile=`cat ${XLOG_NOTE_FILE}.${old} 2>/dev/null` newfile=`cat ${XLOG_NOTE_FILE}.${new} 2>/dev/null` if [ "$oldfile" != "$newfile" ]; then # reset counter rm -f ${XLOG_NOTE_FILE}.* printf "$newfile\n" > ${XLOG_NOTE_FILE}.0 return 1 fi if [ "$new" -ge "$OCF_RESKEY_xlog_check_count" ]; then newestXlog=`printf "$newfile\n" | sort -t " " -k 2,3 -r | \ head -1 | cut -d " " -f 2` if [ "$newestXlog" = "$mylocation" ]; then ocf_log info "I have a master right." exec_with_retry 5 $CRM_MASTER -v $PROMOTE_ME return 0 fi change_data_status "$NODENAME" "DISCONNECT" ocf_log info "I don't have correct master data." # reset counter rm -f ${XLOG_NOTE_FILE}.* printf "$newfile\n" > ${XLOG_NOTE_FILE}.0 fi return 1 } is_replication() { if [ "$OCF_RESKEY_rep_mode" != "none" -a "$OCF_RESKEY_rep_mode" != "slave" ]; then return 0 fi return 1 } use_replication_slot() { if [ -n "$OCF_RESKEY_replication_slot_name" ]; then return 0 fi return 1 } create_replication_slot_name() { local number_of_nodes=0 local target local replication_slot_name local replication_slot_name_list_tmp local replication_slot_name_list if [ -n "$NODE_LIST" ]; then number_of_nodes=`echo $NODE_LIST | wc -w` fi # If the number of nodes 2 or less, Master node has 1 or less Slave node. # The Master node should have 1 slot for the Slave, which is named "$OCF_RES_KEY_replication_slot_name". if [ $number_of_nodes -le 2 ]; then replication_slot_name_list="$OCF_RESKEY_replication_slot_name" # If the number of nodes 3 or more, the Master has some Slave nodes. # The Master node should have some slots equal to the number of Slaves, and # the Slave nodes connect to their dedicated slot on the Master. # To ensuring that the slots name are each unique, add postfix to $OCF_RESKEY_replication_slot. # The postfix is "_$target". 
else for target in $NODE_LIST do if [ "$target" != "$NODENAME" ]; then # The Uppercase, "-" and "." don't allow to use in slot_name. # If the NODENAME contains them, convert upper case to lower case and "_" and "." to "_". target=`echo "$target" | tr '[A-Z.-]' '[a-z__]'` replication_slot_name="$OCF_RESKEY_replication_slot_name"_"$target" replication_slot_name_list_tmp="$replication_slot_name_list" replication_slot_name_list="$replication_slot_name_list_tmp $replication_slot_name" fi done fi echo $replication_slot_name_list } create_replication_slot() { local replication_slot_name local replication_slot_name_list local output local rc local CREATE_REPLICATION_SLOT_sql local DELETE_REPLICATION_SLOT_sql replication_slot_name_list=`create_replication_slot_name` ocf_log debug "replication slot names are $replication_slot_name_list." for replication_slot_name in $replication_slot_name_list do # If the same name slot is already exists, initialize(delete and create) the slot. if [ `check_replication_slot $replication_slot_name` = "1" ]; then DELETE_REPLICATION_SLOT_sql="SELECT pg_drop_replication_slot('$replication_slot_name');" output=`exec_sql "$DELETE_REPLICATION_SLOT_sql"` rc=$? if [ $rc -eq 0 ]; then ocf_log info "PostgreSQL delete the replication slot($replication_slot_name)." else ocf_exit_reason "$output" return $OCF_ERR_GENERIC fi fi CREATE_REPLICATION_SLOT_sql="SELECT pg_create_physical_replication_slot('$replication_slot_name');" output=`exec_sql "$CREATE_REPLICATION_SLOT_sql"` rc=$? if [ $rc -eq 0 ]; then ocf_log info "PostgreSQL creates the replication slot($replication_slot_name)." else ocf_exit_reason "$output" return $OCF_ERR_GENERIC fi done return 0 } # This function check the replication slot does exists. 
check_replication_slot(){ local replication_slot_name=$1 local output local CHECK_REPLICATION_SLOT_sql="SELECT count(*) FROM pg_replication_slots WHERE slot_name = '$replication_slot_name'" output=`exec_sql "$CHECK_REPLICATION_SLOT_sql"` echo "$output" } get_my_location() { local rc local output local replay_loc local receive_loc local output1 local output2 local log1 local log2 local newer_location output=`exec_sql "$CHECK_XLOG_LOC_SQL"` rc=$? if [ $rc -ne 0 ]; then report_psql_error $rc err "Can't get my xlog location." return 1 fi replay_loc=`echo $output | cut -d "|" -f 1` receive_loc=`echo $output | cut -d "|" -f 2` output1=`echo "$replay_loc" | cut -d "/" -f 1` output2=`echo "$replay_loc" | cut -d "/" -f 2` log1=`printf "%08s\n" $output1 | sed "s/ /0/g"` log2=`printf "%08s\n" $output2 | sed "s/ /0/g"` replay_loc="${log1}${log2}" output1=`echo "$receive_loc" | cut -d "/" -f 1` output2=`echo "$receive_loc" | cut -d "/" -f 2` log1=`printf "%08s\n" $output1 | sed "s/ /0/g"` log2=`printf "%08s\n" $output2 | sed "s/ /0/g"` receive_loc="${log1}${log2}" newer_location=`printf "$replay_loc\n$receive_loc" | sort -r | head -1` echo "$newer_location" return 0 } show_xlog_location() { local location location=`get_my_location` || return 1 exec_with_retry 0 $CRM_ATTR_REBOOT -N "$NODENAME" -n "$PGSQL_XLOG_LOC_NAME" -v "$location" } delete_xlog_location() { exec_with_retry 5 $CRM_ATTR_REBOOT -N "$NODENAME" -n "$PGSQL_XLOG_LOC_NAME" -D } show_master_baseline() { local rc local location location=`get_my_location` ocf_log info "My master baseline : $location." exec_with_retry 0 $CRM_ATTR_REBOOT -N "$NODENAME" -n "$PGSQL_MASTER_BASELINE" -v "$location" } delete_master_baseline() { exec_with_retry 5 $CRM_ATTR_REBOOT -N "$NODENAME" -n "$PGSQL_MASTER_BASELINE" -D } set_async_mode_all() { [ "$OCF_RESKEY_rep_mode" = "sync" ] || return 0 ocf_log info "Set all nodes into async mode." runasowner -q err "echo \"synchronous_standby_names = ''\" > \"$REP_MODE_CONF\"" if [ $? 
-ne 0 ]; then ocf_exit_reason "Can't set all nodes into async mode." return 1 fi return 0 } set_async_mode() { cat $REP_MODE_CONF | grep -q -e "[,' ]$1[,' ]" if [ $? -eq 0 ]; then ocf_log info "Setup $1 into async mode." runasowner -q err "echo \"synchronous_standby_names = ''\" > \"$REP_MODE_CONF\"" else ocf_log debug "$1 is already in async mode." return 0 fi exec_with_retry 0 reload_conf } set_sync_mode() { local sync_node_in_conf sync_node_in_conf=`cat $REP_MODE_CONF | cut -d "'" -f 2` if [ -n "$sync_node_in_conf" ]; then ocf_log debug "$sync_node_in_conf is already sync mode." else ocf_log info "Setup $1 into sync mode." runasowner -q err "echo \"synchronous_standby_names = '$1'\" > \"$REP_MODE_CONF\"" [ "$RE_CONTROL_SLAVE" = "false" ] && RE_CONTROL_SLAVE="true" exec_with_retry 0 reload_conf fi } reload_conf() { # Invoke pg_ctl runasowner "$OCF_RESKEY_pgctl -D $OCF_RESKEY_pgdata reload" if [ $? -eq 0 ]; then ocf_log info "Reload configuration file." else ocf_exit_reason "Can't reload configuration file." return 1 fi return 0 } user_recovery_conf() { local number_of_nodes local nodename_tmp # put archive_cleanup_command and recovery_end_command only when defined by user if [ -n "$OCF_RESKEY_archive_cleanup_command" ]; then echo "archive_cleanup_command = '${OCF_RESKEY_archive_cleanup_command}'" fi if [ -n "$OCF_RESKEY_recovery_end_command" ]; then echo "recovery_end_command = '${OCF_RESKEY_recovery_end_command}'" fi if use_replication_slot; then number_of_nodes=`echo $NODE_LIST | wc -w` if [ $number_of_nodes -le 2 ]; then echo "primary_slot_name = '${OCF_RESKEY_replication_slot_name}'" else nodename_tmp=`echo "$NODENAME" | tr '[A-Z.-]' '[a-z__]'` echo "primary_slot_name = '${OCF_RESKEY_replication_slot_name}_$nodename_tmp'" fi fi } make_recovery_conf() { runasowner "touch $RECOVERY_CONF" if [ $? -ne 0 ]; then ocf_exit_reason "Can't create recovery.conf." return 1 fi cat > $RECOVERY_CONF <> $RECOVERY_CONF ocf_log debug "Created recovery.conf. 
host=${OCF_RESKEY_master_ip}, user=${OCF_RESKEY_repuser}" return 0 } # change pgsql-status. # arg1:node, arg2: value change_pgsql_status() { local output if ! is_node_online $1; then return 0 fi output=`$CRM_ATTR_REBOOT -N "$1" -n "$PGSQL_STATUS_ATTR" -G -q 2>/dev/null` if [ "$output" != "$2" ]; then # If slave's disk is broken, RA cannot read PID file # and misjudges the PostgreSQL as down while it is running. # It causes overwriting of pgsql-status by Master because replication is still connected. if [ "$output" = "STOP" -o "$output" = "UNKNOWN" ]; then if [ "$1" != "$NODENAME" ]; then ocf_log warn "Changing $PGSQL_STATUS_ATTR on $1 : $output->$2 by $NODENAME is prohibited." return 0 fi fi ocf_log info "Changing $PGSQL_STATUS_ATTR on $1 : $output->$2." exec_with_retry 0 $CRM_ATTR_REBOOT -N "$1" -n "$PGSQL_STATUS_ATTR" -v "$2" fi return 0 } # change pgsql-data-status. # arg1:node, arg2: value change_data_status() { local output if ! node_exist $1; then return 0 fi while : do output=`$CRM_ATTR_FOREVER -N "$1" -n "$PGSQL_DATA_STATUS_ATTR" -G -q 2>/dev/null` if [ "$output" != "$2" ]; then ocf_log info "Changing $PGSQL_DATA_STATUS_ATTR on $1 : $output->$2." exec_with_retry 0 exec_with_timeout 0 "$CRM_ATTR_FOREVER" -N $1 -n $PGSQL_DATA_STATUS_ATTR -v "$2" else break fi done return 0 } # set master-score # arg1:node, arg2: score, arg3: resoure set_master_score() { local current_score current_score=`$CRM_ATTR_REBOOT -N "$1" -n "master-$3" -G -q 2>/dev/null` if [ -n "$current_score" -a "$current_score" != "$2" ]; then ocf_log info "Changing $3 master score on $1 : $current_score->$2." exec_with_retry 0 $CRM_ATTR_REBOOT -N "$1" -n "master-$3" -v "$2" fi return 0 } # change master-score # arg1:node, arg2: score change_master_score() { local instance if ! 
is_node_online $1; then return 0 fi if echo $OCF_RESOURCE_INSTANCE | grep -q ":"; then # If Pacemaker version is 1.0.x instance=0 while : do if [ "$instance" -ge "$OCF_RESKEY_CRM_meta_clone_max" ]; then break fi if [ "${RESOURCE_NAME}:${instance}" = "$OCF_RESOURCE_INSTANCE" ]; then instance=`expr $instance + 1` continue fi set_master_score $1 $2 "${RESOURCE_NAME}:${instance}" || return 1 instance=`expr $instance + 1` done else # If globally-unique=false and Pacemaker version is 1.1.8 or higher # Master/Slave resource has no instance number set_master_score $1 $2 ${RESOURCE_NAME} || return 1 fi return 0 } report_psql_error() { local rc local loglevel local message rc=$1 loglevel=${2:-err} message="$3" ocf_log $loglevel "$message rc=$rc" if [ $rc -eq 1 ]; then ocf_exit_reason "Fatal error (out of memory, file not found, etc.) occurred while executing the psql command." elif [ $rc -eq 2 ]; then ocf_log $loglevel "Connection error (connection to the server went bad and the session was not interactive) occurred while executing the psql command." elif [ $rc -eq 3 ]; then ocf_exit_reason "Script error (the variable ON_ERROR_STOP was set) occurred while executing the psql command." fi } # # timeout management function # arg1 timeout >= 0 (if arg1 is 0, OCF_RESKEY_crm_attr_timeout is used.) # arg2 : command # arg3 : command's args exec_with_timeout() { local func_pid local count=$OCF_RESKEY_crm_attr_timeout local rc if [ "$1" -ne 0 ]; then count=$1 fi shift $* & func_pid=$! sleep .1 while kill -s 0 $func_pid >/dev/null 2>&1; do sleep 1 count=`expr $count - 1` if [ $count -le 0 ]; then ocf_exit_reason "\"$*\" (pid=$func_pid) timed out." kill -s 9 $func_pid >/dev/null 2>&1 return 1 fi ocf_log info "Waiting($count). \"$*\" (pid=$func_pid)." 
done wait $func_pid } # retry command when command doesn't return 0 # arg1 : count >= 0 (if arg1 is 0, it retries command in infinitum(1day)) # arg2..argN : command and args exec_with_retry() { local count="86400" local output local rc if [ "$1" -ne 0 ]; then count=$1 fi shift while [ $count -gt 0 ]; do output=`$*` rc=$? if [ $rc -ne 0 ]; then ocf_log warn "Retrying(remain $count). \"$*\" failed. rc=$rc. stdout=\"$output\"." count=`expr $count - 1` sleep 1 else printf "${output}" return 0 fi done ocf_exit_reason "giving up executing \"$*\"" return $rc } is_node_online() { print_crm_mon | tr '[A-Z]' '[a-z]' | grep -e "^node $1 " -e "^node $1:" | grep -q -v "offline" } node_exist() { print_crm_mon | tr '[A-Z]' '[a-z]' | grep -q "^node $1" } check_binary2() { if ! have_binary "$1"; then ocf_exit_reason "Setup problem: couldn't find command: $1" return 1 fi return 0 } check_config() { local rc=0 if [ ! -f "$1" ]; then if ocf_is_probe; then ocf_log info "Configuration file is $1 not readable during probe." rc=1 else ocf_exit_reason "Configuration file $1 doesn't exist" rc=2 fi fi return $rc } # Validate most critical parameters pgsql_validate_all() { local version local check_config_rc local rep_mode_string local socket_directories version=`cat $OCF_RESKEY_pgdata/PG_VERSION` if ! check_binary2 "$OCF_RESKEY_pgctl" || ! check_binary2 "$OCF_RESKEY_psql"; then return $OCF_ERR_INSTALLED fi check_config "$OCF_RESKEY_config" check_config_rc=$? [ $check_config_rc -eq 2 ] && return $OCF_ERR_INSTALLED if [ $check_config_rc -eq 0 ]; then ocf_version_cmp "$version" "9.3" if [ $? -eq 0 ]; then : ${OCF_RESKEY_socketdir=`get_pgsql_param unix_socket_directory`} else # unix_socket_directories is used by PostgreSQL 9.3 or higher. socket_directories=`get_pgsql_param unix_socket_directories` if [ -n "$socket_directories" ]; then # unix_socket_directories may have multiple socket directories and the pgsql RA can not know which directory is used for psql command. 
# Therefore, the user must set OCF_RESKEY_socketdir explicitly. if [ -z "$OCF_RESKEY_socketdir" ]; then ocf_exit_reason "In PostgreSQL 9.3 or higher, socketdir can't be empty if you define unix_socket_directories in the postgresql.conf." return $OCF_ERR_CONFIGURED fi fi fi fi getent passwd $OCF_RESKEY_pgdba >/dev/null 2>&1 if [ ! $? -eq 0 ]; then ocf_exit_reason "User $OCF_RESKEY_pgdba doesn't exist"; return $OCF_ERR_INSTALLED; fi if ocf_is_probe; then ocf_log info "Don't check $OCF_RESKEY_pgdata during probe" else if ! runasowner "test -w $OCF_RESKEY_pgdata"; then ocf_exit_reason "Directory $OCF_RESKEY_pgdata is not writable by $OCF_RESKEY_pgdba" return $OCF_ERR_PERM; fi fi if [ -n "$OCF_RESKEY_monitor_user" -a ! -n "$OCF_RESKEY_monitor_password" ] then ocf_exit_reason "monitor password can't be empty" return $OCF_ERR_CONFIGURED fi if [ ! -n "$OCF_RESKEY_monitor_user" -a -n "$OCF_RESKEY_monitor_password" ] then ocf_exit_reason "monitor_user has to be set if monitor_password is set" return $OCF_ERR_CONFIGURED fi if is_replication || [ "$OCF_RESKEY_rep_mode" = "slave" ]; then if [ `printf "$version\n9.1" | sort -n | head -1` != "9.1" ]; then ocf_exit_reason "Replication mode needs PostgreSQL 9.1 or higher." return $OCF_ERR_INSTALLED fi if [ ! -n "$OCF_RESKEY_master_ip" ]; then ocf_exit_reason "master_ip can't be empty." return $OCF_ERR_CONFIGURED fi fi if is_replication; then if ! ocf_is_ms; then ocf_exit_reason "Replication(rep_mode=async or sync) requires Master/Slave configuration." return $OCF_ERR_CONFIGURED fi if [ ! "$OCF_RESKEY_rep_mode" = "sync" -a ! "$OCF_RESKEY_rep_mode" = "async" ]; then ocf_exit_reason "Invalid rep_mode : $OCF_RESKEY_rep_mode" return $OCF_ERR_CONFIGURED fi if [ ! -n "$NODE_LIST" ]; then ocf_exit_reason "node_list can't be empty." return $OCF_ERR_CONFIGURED fi if [ $check_config_rc -eq 0 ]; then rep_mode_string="include '$REP_MODE_CONF' # added by pgsql RA" if [ "$OCF_RESKEY_rep_mode" = "sync" ]; then if ! 
grep -q "$rep_mode_string" $OCF_RESKEY_config; then ocf_log info "adding include directive into $OCF_RESKEY_config" echo "$rep_mode_string" >> $OCF_RESKEY_config fi else if grep -q "$rep_mode_string" $OCF_RESKEY_config; then ocf_log info "deleting include directive from $OCF_RESKEY_config" rep_mode_string=`echo $rep_mode_string | sed -e 's|/|\\\\/|g'` sed -i "/$rep_mode_string/d" $OCF_RESKEY_config fi fi fi if ! mkdir -p $OCF_RESKEY_tmpdir || ! chown $OCF_RESKEY_pgdba $OCF_RESKEY_tmpdir || ! chmod 700 $OCF_RESKEY_tmpdir; then ocf_exit_reason "Can't create directory $OCF_RESKEY_tmpdir or it is not readable by $OCF_RESKEY_pgdba" return $OCF_ERR_PERM fi fi if [ "$OCF_RESKEY_rep_mode" = "slave" ]; then if ocf_is_ms; then ocf_exit_reason "Replication(rep_mode=slave) does not support Master/Slave configuration." return $OCF_ERR_CONFIGURED fi fi if use_replication_slot; then ocf_version_cmp "$version" "9.4" if [ $? -eq 0 -o $? -eq 3 ]; then ocf_exit_reason "Replication slot needs PostgreSQL 9.4 or higher." return $OCF_ERR_CONFIGURED fi echo "$OCF_RESKEY_replication_slot_name" | grep -q -e [^a-z0-9_] if [ $? -eq 0 ]; then ocf_exit_reason "Invalid replication_slot_name($OCF_RESKEY_replication_slot_name). only use lower case letters, numbers, and the underscore character." return $OCF_ERR_CONFIGURED fi fi return $OCF_SUCCESS } # # Check if we need to create a log file # check_log_file() { if [ ! -f "$1" ] then touch $1 > /dev/null 2>&1 chown $OCF_RESKEY_pgdba:`getent passwd $OCF_RESKEY_pgdba | cut -d ":" -f 4` $1 fi #Check if $OCF_RESKEY_pgdba can write to the log file if ! runasowner "test -w $1" then return 1 fi return 0 } # # Check socket directory # check_socket_dir() { if [ ! -d "$OCF_RESKEY_socketdir" ]; then if ! mkdir "$OCF_RESKEY_socketdir"; then ocf_exit_reason "Can't create directory $OCF_RESKEY_socketdir" exit $OCF_ERR_PERM fi if ! 
chown $OCF_RESKEY_pgdba:`getent passwd \ $OCF_RESKEY_pgdba | cut -d ":" -f 4` "$OCF_RESKEY_socketdir" then ocf_exit_reason "Can't change ownership for $OCF_RESKEY_socketdir" exit $OCF_ERR_PERM fi if ! chmod 2775 "$OCF_RESKEY_socketdir"; then ocf_exit_reason "Can't change permissions for $OCF_RESKEY_socketdir" exit $OCF_ERR_PERM fi else if ! runasowner "touch $OCF_RESKEY_socketdir/test.$$"; then ocf_exit_reason "$OCF_RESKEY_pgdba can't create files in $OCF_RESKEY_socketdir" exit $OCF_ERR_PERM fi rm $OCF_RESKEY_socketdir/test.$$ fi } print_crm_mon() { if [ -z "$CRM_MON_OUTPUT" ]; then CRM_MON_OUTPUT=`exec_with_retry 0 crm_mon -n1` fi printf "${CRM_MON_OUTPUT}\n" } # # 'main' starts here... # if [ $# -ne 1 ] then usage exit $OCF_ERR_GENERIC fi PIDFILE=${OCF_RESKEY_pgdata}/postmaster.pid BACKUPLABEL=${OCF_RESKEY_pgdata}/backup_label RESOURCE_NAME=`echo $OCF_RESOURCE_INSTANCE | cut -d ":" -f 1` PGSQL_WAL_RECEIVER_STATUS_ATTR="${RESOURCE_NAME}-receiver-status" RECOVERY_CONF=${OCF_RESKEY_pgdata}/recovery.conf NODENAME=$(ocf_local_nodename | tr '[A-Z]' '[a-z]') if is_replication; then REP_MODE_CONF=${OCF_RESKEY_tmpdir}/rep_mode.conf PGSQL_LOCK=${OCF_RESKEY_tmpdir}/PGSQL.lock XLOG_NOTE_FILE=${OCF_RESKEY_tmpdir}/xlog_note CRM_MASTER="${HA_SBIN_DIR}/crm_master -l reboot" CRM_ATTR_REBOOT="${HA_SBIN_DIR}/crm_attribute -l reboot" CRM_ATTR_FOREVER="${HA_SBIN_DIR}/crm_attribute -l forever" CRM_FAILCOUNT="${HA_SBIN_DIR}/crm_failcount" CAN_NOT_PROMOTE="-INFINITY" CAN_PROMOTE="100" PROMOTE_ME="1000" CHECK_MS_SQL="select pg_is_in_recovery()" CHECK_XLOG_LOC_SQL="select pg_last_xlog_replay_location(),pg_last_xlog_receive_location()" CHECK_REPLICATION_STATE_SQL="select application_name,upper(state),upper(sync_state) from pg_stat_replication" PGSQL_STATUS_ATTR="${RESOURCE_NAME}-status" PGSQL_DATA_STATUS_ATTR="${RESOURCE_NAME}-data-status" PGSQL_XLOG_LOC_NAME="${RESOURCE_NAME}-xlog-loc" PGSQL_MASTER_BASELINE="${RESOURCE_NAME}-master-baseline" NODE_LIST=`echo $OCF_RESKEY_node_list | tr 
'[A-Z]' '[a-z]'` RE_CONTROL_SLAVE="false" fi case "$1" in methods) pgsql_methods exit $?;; meta-data) meta_data exit $OCF_SUCCESS;; esac pgsql_validate_all rc=$? [ "$1" = "validate-all" ] && exit $rc if [ $rc -ne 0 ] then case "$1" in stop) if is_replication; then change_pgsql_status "$NODENAME" "UNKNOWN" fi exit $OCF_SUCCESS;; monitor) exit $OCF_NOT_RUNNING;; status) exit $OCF_NOT_RUNNING;; *) exit $rc;; esac fi US=`id -u -n` if [ $US != root -a $US != $OCF_RESKEY_pgdba ] then ocf_exit_reason "$0 must be run as root or $OCF_RESKEY_pgdba" exit $OCF_ERR_GENERIC fi # make psql command options if [ -n "$OCF_RESKEY_monitor_user" ]; then PGUSER=$OCF_RESKEY_monitor_user; export PGUSER PGPASSWORD=$OCF_RESKEY_monitor_password; export PGPASSWORD psql_options="-p $OCF_RESKEY_pgport $OCF_RESKEY_pgdb" else psql_options="-p $OCF_RESKEY_pgport -U $OCF_RESKEY_pgdba $OCF_RESKEY_pgdb" fi if [ -n "$OCF_RESKEY_pghost" ]; then psql_options="$psql_options -h $OCF_RESKEY_pghost" else if [ -n "$OCF_RESKEY_socketdir" ]; then psql_options="$psql_options -h $OCF_RESKEY_socketdir" fi fi if [ -n "$OCF_RESKEY_pgport" ]; then export PGPORT=$OCF_RESKEY_pgport fi if [ -n "$OCF_RESKEY_pglibs" ]; then if [ -n "$LD_LIBRARY_PATH" ]; then export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$OCF_RESKEY_pglibs else export LD_LIBRARY_PATH=$OCF_RESKEY_pglibs fi fi # What kind of method was invoked? 
case "$1" in status) if pgsql_status then ocf_log info "PostgreSQL is up" exit $OCF_SUCCESS else ocf_log info "PostgreSQL is down" exit $OCF_NOT_RUNNING fi;; monitor) pgsql_monitor exit $?;; start) pgsql_start exit $?;; promote) pgsql_promote exit $?;; demote) pgsql_demote exit $?;; notify) pgsql_notify exit $?;; stop) pgsql_stop exit $?;; *) exit $OCF_ERR_UNIMPLEMENTED;; esac diff --git a/heartbeat/rabbitmq-cluster b/heartbeat/rabbitmq-cluster index cc45f09ad..966dd64d1 100755 --- a/heartbeat/rabbitmq-cluster +++ b/heartbeat/rabbitmq-cluster @@ -1,370 +1,465 @@ #!/bin/sh # # Copyright (c) 2014 David Vossel # All Rights Reserved. # # This program is free software; you can redistribute it and/or modify # it under the terms of version 2 of the GNU General Public License as # published by the Free Software Foundation. # # This program is distributed in the hope that it would be useful, but # WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. # # Further, this software is distributed without any warranty that it is # free of the rightful claim of any third person regarding infringement # or the like. Any license provided herein, whether implied or # otherwise, applies only to this software file. Patent licenses, if # any, provided herein do not apply to combinations of this program with # other software, or any other product whatsoever. # # You should have received a copy of the GNU General Public License # along with this program; if not, write the Free Software Foundation, # Inc., 59 Temple Place - Suite 330, Boston MA 02111-1307, USA. # ####################################################################### # Initialization: : ${OCF_FUNCTIONS_DIR=${OCF_ROOT}/lib/heartbeat} . 
${OCF_FUNCTIONS_DIR}/ocf-shellfuncs ####################################################################### RMQ_SERVER=/usr/sbin/rabbitmq-server RMQ_CTL=/usr/sbin/rabbitmqctl RMQ_DATA_DIR="/var/lib/rabbitmq/mnesia" RMQ_PID_DIR="/var/run/rabbitmq" RMQ_PID_FILE="/var/run/rabbitmq/rmq.pid" RMQ_LOG_DIR="/var/log/rabbitmq" NODENAME=$(ocf_local_nodename) +# this attr represents the current active local rmq node name. +# when rmq stops or the node is fenced, this attr disappears RMQ_CRM_ATTR_COOKIE="rmq-node-attr-${OCF_RESOURCE_INSTANCE}" +# this attr represents the last known active local rmq node name +# when rmp stops or the node is fenced, the attr stays forever so +# we can continue to map an offline pcmk node to it's rmq node name +# equivalent. +RMQ_CRM_ATTR_COOKIE_LAST_KNOWN="rmq-node-attr-last-known-${OCF_RESOURCE_INSTANCE}" meta_data() { cat < 1.0 Starts cloned rabbitmq cluster instance rabbitmq clustered Policy string to pass to 'rabbitmqctl set_policy' right after bootstrapping the first rabbitmq instance. rabbitmqctl set_policy args END } ####################################################################### rmq_usage() { cat < /dev/null 2>&1 } rmq_local_node() { local node_name=$(rabbitmqctl status 2>&1 | sed -n -e "s/^.*[S|s]tatus of node \(.*\)\s.*$/\1/p" | tr -d "'") if [ -z "$node_name" ]; then node_name=$(cat /etc/rabbitmq/rabbitmq-env.conf 2>/dev/null | grep "\s*RABBITMQ_NODENAME=" | awk -F= '{print $2}') fi echo "$node_name" } rmq_join_list() { cibadmin -Q --xpath "//node_state[@crmd='online']//nvpair[@name='$RMQ_CRM_ATTR_COOKIE']" | grep "$RMQ_CRM_ATTR_COOKIE" | sed -n -e "s/^.*value=.\(.*\)\".*$/\1/p" } rmq_write_nodename() { local node_name=$(rmq_local_node) if [ -z "$node_name" ]; then ocf_log err "Failed to determine rabbitmq node name, exiting" exit $OCF_ERR_GENERIC fi - # store the pcmknode to rmq node mapping as an attribute + # store the pcmknode to rmq node mapping as a transient attribute. 
This allows + # us to retrieve the join list with a simple xpath. ${HA_SBIN_DIR}/crm_attribute -N $NODENAME -l reboot --name "$RMQ_CRM_ATTR_COOKIE" -v "$node_name" + + # the pcmknode to rmq node mapping as a permanent attribute as well. this lets + # us continue to map offline nodes to their equivalent rmq node name + ${HA_SBIN_DIR}/crm_attribute -N $NODENAME -l forever --name "$RMQ_CRM_ATTR_COOKIE_LAST_KNOWN" -v "$node_name" } rmq_delete_nodename() { # remove node-name ${HA_SBIN_DIR}/crm_attribute -N $NODENAME -l reboot --name "$RMQ_CRM_ATTR_COOKIE" -D } prepare_dir () { if [ ! -d ${1} ] ; then mkdir -p ${1} chown -R rabbitmq:rabbitmq ${1} chmod 755 ${1} fi } remove_pid () { rm -f ${RMQ_PID_FILE} > /dev/null 2>&1 } rmq_monitor() { local rc $RMQ_CTL cluster_status > /dev/null 2>&1 rc=$? case "$rc" in 0) ocf_log debug "RabbitMQ server is running normally" rmq_write_nodename return $OCF_SUCCESS ;; - 2) + 2|68|69|70|75|78) ocf_log info "RabbitMQ server is not running" rmq_delete_nodename return $OCF_NOT_RUNNING ;; *) - ocf_log err "Unexpected return code from '$RMQ_CTL cluster status' exit code: $rc" + ocf_log err "Unexpected return code from '$RMQ_CTL cluster_status' exit code: $rc" rmq_delete_nodename return $OCF_ERR_GENERIC ;; esac } rmq_init_and_wait() { local rc prepare_dir $RMQ_PID_DIR prepare_dir $RMQ_LOG_DIR remove_pid # the server startup script uses this environment variable export RABBITMQ_PID_FILE="$RMQ_PID_FILE" setsid sh -c "$RMQ_SERVER > ${RMQ_LOG_DIR}/startup_log 2> ${RMQ_LOG_DIR}/startup_err" & ocf_log info "Waiting for server to start" $RMQ_CTL wait $RMQ_PID_FILE rc=$? if [ $rc -ne $OCF_SUCCESS ]; then remove_pid ocf_log info "rabbitmq-server start failed: $rc" return $OCF_ERR_GENERIC fi rmq_monitor return $? } rmq_set_policy() { $RMQ_CTL set_policy $@ > /dev/null 2>&1 } rmq_start_first() { local rc ocf_log info "Bootstrapping rabbitmq cluster" rmq_wipe_data rmq_init_and_wait rc=$? 
if [ $rc -eq 0 ]; then rc=$OCF_SUCCESS ocf_log info "cluster bootstrapped" if [ -n "$OCF_RESKEY_set_policy" ]; then # do not quote set_policy, we are passing in arguments rmq_set_policy $OCF_RESKEY_set_policy > /dev/null 2>&1 if [ $? -ne 0 ]; then ocf_log err "Failed to set policy: $OCF_RESKEY_set_policy" rc=$OCF_ERR_GENERIC else ocf_log info "Policy set: $OCF_RESKEY_set_policy" fi fi else ocf_log info "failed to bootstrap cluster. Check SELINUX policy" rc=$OCF_ERR_GENERIC fi return $rc } +rmq_is_clustered() +{ + $RMQ_CTL eval 'rabbit_mnesia:is_clustered().' | grep -q true +} + rmq_join_existing() { local join_list="$1" local rc=$OCF_ERR_GENERIC ocf_log info "Joining existing cluster with [ $(echo $join_list | tr '\n' ' ') ] nodes." rmq_init_and_wait if [ $? -ne 0 ]; then return $OCF_ERR_GENERIC fi + if rmq_is_clustered; then + ocf_log info "Successfully re-joined existing rabbitmq cluster automatically" + return $OCF_SUCCESS + fi + # unconditionally join the cluster $RMQ_CTL stop_app > /dev/null 2>&1 for node in $(echo "$join_list"); do ocf_log info "Attempting to join cluster with target node $node" $RMQ_CTL join_cluster $node if [ $? -eq 0 ]; then ocf_log info "Joined cluster by connecting to node $node, starting app" $RMQ_CTL start_app rc=$? if [ $rc -ne 0 ]; then ocf_log err "'$RMQ_CTL start_app' failed" fi break; fi done if [ "$rc" -ne 0 ]; then ocf_log info "Join process incomplete, shutting down." return $OCF_ERR_GENERIC fi ocf_log info "Successfully joined existing rabbitmq cluster" return $OCF_SUCCESS } +rmq_forget_cluster_node_remotely() { + local running_cluster_nodes="$1" + local node_to_forget="$2" + + ocf_log info "Forgetting $node_to_forget via nodes [ $(echo $running_cluster_nodes | tr '\n' ' ') ]." + for running_cluster_node in $running_cluster_nodes; do + rabbitmqctl -n $running_cluster_node forget_cluster_node $node_to_forget + if [ $? = 0 ]; then + ocf_log info "Succeeded forgetting $node_to_forget via $running_cluster_node." 
+ return + else + ocf_log err "Failed to forget node $node_to_forget via $running_cluster_node." + fi + done +} + +rmq_notify() { + node_list="${OCF_RESKEY_CRM_meta_notify_stop_uname}" + mode="${OCF_RESKEY_CRM_meta_notify_type}-${OCF_RESKEY_CRM_meta_notify_operation}" + + + # When notifications are on, this agent is going to "forget" nodes once they + # leave the cluster. This is thought to resolve some issues where rabbitmq + # blocks trying to sync with an offline node after a fencing action occurs. + if ! [ "${mode}" = "post-stop" ]; then + return $OCF_SUCCESS + fi + + rmq_monitor + if [ $? -ne $OCF_SUCCESS ]; then + # only run forget when we are for sure active + return $OCF_SUCCESS + fi + + # forget each stopped rmq instance in the provided pcmk node in the list. + for node in $(echo "$node_list"); do + local rmq_node="$(${HA_SBIN_DIR}/crm_attribute -N $node -l forever --query --name $RMQ_CRM_ATTR_COOKIE_LAST_KNOWN -q)" + if [ -z "$rmq_node" ]; then + ocf_log warn "Unable to map pcmk node $node to a known rmq node." + continue + fi + ocf_log notice "Forgetting stopped node $rmq_node" + $RMQ_CTL forget_cluster_node $rmq_node + if [ $? -ne 0 ]; then + ocf_log warn "Unable to forget offline node $rmq_node." + fi + done + return $OCF_SUCCESS +} + rmq_start() { local join_list="" local rc rmq_monitor if [ $? -eq $OCF_SUCCESS ]; then return $OCF_SUCCESS fi join_list=$(rmq_join_list) # No join list means no active instances are up. This instance # is the first, so it needs to bootstrap the rest if [ -z "$join_list" ]; then rmq_start_first rc=$? return $rc fi # first try to join without wiping mnesia data rmq_join_existing "$join_list" if [ $? -ne 0 ]; then ocf_log info "node failed to join, wiping data directory and trying again" + local local_rmq_node="$(${HA_SBIN_DIR}/crm_attribute -N $NODENAME -l forever --query --name $RMQ_CRM_ATTR_COOKIE_LAST_KNOWN -q)" + # if the graceful join fails, use the hammer and reset all the data. 
rmq_stop rmq_wipe_data + rmq_forget_cluster_node_remotely "$join_list" "$local_rmq_node" rmq_join_existing "$join_list" - if [ $? -ne 0 ]; then + rc=$? + + # Restore users (if any) + BaseDataDir=`dirname $RMQ_DATA_DIR` + if [ -f $BaseDataDir/users.erl ] ; then + rabbitmqctl eval " + {ok, [Users]} = file:consult(\"$BaseDataDir/users.erl\"), + lists:foreach(fun(X) -> mnesia:dirty_write(rabbit_user, X) end, Users). + " + rm -f $BaseDataDir/users.erl + fi + + if [ $rc -ne 0 ]; then ocf_log info "node failed to join even after reseting local data. Check SELINUX policy" return $OCF_ERR_GENERIC fi fi return $OCF_SUCCESS } rmq_stop() { + # Backup users + BaseDataDir=`dirname $RMQ_DATA_DIR` + rabbitmqctl eval " + Users = mnesia:dirty_select(rabbit_user, [{ {internal_user, '\\\$1', '_', '_'}, [{'/=', '\\\$1', <<\"guest\">>}], ['\\\$_'] } ]), + file:write_file(\"$BaseDataDir/users.erl\", io_lib:fwrite(\"~p.~n\", [Users])). + " + rmq_monitor if [ $? -eq $OCF_NOT_RUNNING ]; then return $OCF_SUCCESS fi $RMQ_CTL stop rc=$? if [ $rc -ne 0 ]; then ocf_log err "rabbitmq-server stop command failed: $RMQ_CTL stop, $rc" return $rc fi #TODO add kill logic stop_wait=1 while [ $stop_wait = 1 ]; do rmq_monitor rc=$? if [ "$rc" -eq $OCF_NOT_RUNNING ]; then stop_wait=0 break elif [ "$rc" -ne $OCF_SUCCESS ]; then ocf_log info "rabbitmq-server stop failed: $rc" exit $OCF_ERR_GENERIC fi sleep 1 done remove_pid return $OCF_SUCCESS } rmq_validate() { check_binary $RMQ_SERVER check_binary $RMQ_CTL # This resource only makes sense as a clone right now. at some point # we may want to verify the following. #TODO verify cloned #TODO verify ordered=true # Given that this resource does the cluster join explicitly, # having a cluster_nodes list in the static config file will # likely conflict with this agent. 
#TODO verify no cluster list in rabbitmq conf #cat /etc/rabbitmq/rabbitmq.config | grep "cluster_nodes" return $OCF_SUCCESS } case $__OCF_ACTION in meta-data) meta_data exit $OCF_SUCCESS ;; start) rmq_start;; stop) rmq_stop;; monitor) rmq_monitor;; validate-all) rmq_validate;; +notify) rmq_notify;; usage|help) rmq_usage exit $OCF_SUCCESS ;; *) rmq_usage exit $OCF_ERR_UNIMPLEMENTED ;; esac rc=$? ocf_log debug "${OCF_RESOURCE_INSTANCE} $__OCF_ACTION : $rc" exit $rc diff --git a/heartbeat/sapdb.sh b/heartbeat/sapdb.sh index 7edb4b88d..03474fc05 100755 --- a/heartbeat/sapdb.sh +++ b/heartbeat/sapdb.sh @@ -1,340 +1,367 @@ # # sapdb.sh - for systems having SAPHostAgent installed # (sourced by SAPDatabase) # # Description: This code is separated from the SAPDatabase agent to # introduce new functions for systems which having # SAPHostAgent installed. # Someday it might be merged back into SAPDatabase agein. # # Author: Alexander Krauth, September 2010 # Support: linux@sap.com # License: GNU General Public License (GPL) # Copyright: (c) 2010, 2012 Alexander Krauth # # # background_check_saphostexec : Run a request to saphostexec in a separat task, to be able to react on a hanging process # background_check_saphostexec() { timeout=600 count=0 $SAPHOSTCTRL -function ListDatabases >/dev/null 2>&1 & pid=$! 
while kill -0 $pid > /dev/null 2>&1 do sleep 0.1 count=$(( $count + 1 )) if [ $count -ge $timeout ]; then kill -9 $pid >/dev/null 2>&1 ocf_log warn "saphostexec did not respond to the method 'ListDatabases' within 60 seconds" return $OCF_ERR_GENERIC # Timeout fi done # child already has finished, now evaluate it's returncode wait $pid } # # cleanup_saphostexec : make sure to cleanup the SAPHostAgent in case of any # misbehavior # cleanup_saphostexec() { pkill -9 -f "$SAPHOSTEXEC" pkill -9 -f "$SAPHOSTSRV" oscolpid=`pgrep -f "$SAPHOSTOSCOL"` # we check saposcol pid, because it # might not run under control of # saphostexec # cleanup saposcol shared memory, otherwise it will not start again if [ -n "$oscolpid" ];then kill -9 $oscolpid oscolipc=`ipcs -m | grep "4dbe " | awk '{print $2}'` if [ -n "$oscolipc" ]; then ipcrm -m $oscolipc fi fi # removing the unix domain socket file as it might have wrong permissions or # ownership - it will be recreated by saphostexec during next start [ -r /tmp/.sapstream1128 ] && rm -f /tmp/.sapstream1128 } # # check_saphostexec : Before using saphostctrl we make sure that the # saphostexec is running on the current node. # check_saphostexec() { chkrc=$OCF_SUCCESS running=`pgrep -f "$SAPHOSTEXEC" | wc -l` if [ $running -gt 0 ]; then if background_check_saphostexec; then return $OCF_SUCCESS else ocf_log warn "saphostexec did not respond to the method 'ListDatabases' correctly (rc=$?), it will be killed now" running=0 fi fi if [ $running -eq 0 ]; then ocf_log warn "saphostexec is not running on node `hostname`, it will be started now" cleanup_saphostexec output=`$SAPHOSTEXEC -restart 2>&1` # now make sure the daemon has been started and is able to respond srvrc=1 while [ $srvrc -ne 0 -a `pgrep -f "$SAPHOSTEXEC" | wc -l` -gt 0 ] do sleep 1 background_check_saphostexec srvrc=$? done if [ $srvrc -eq 0 ] then ocf_log info "saphostexec on node `hostname` was restarted !" 
chkrc=$OCF_SUCCESS else ocf_log error "saphostexec on node `hostname` could not be started! - $output" chkrc=$OCF_ERR_GENERIC fi fi return $chkrc } # # sapdatabase_start : Start the SAP database # sapdatabase_start() { check_saphostexec rc=$? if [ $rc -eq $OCF_SUCCESS ] then sapuserexit PRE_START_USEREXIT "$OCF_RESKEY_PRE_START_USEREXIT" DBINST="" if [ -n "$OCF_RESKEY_DBINSTANCE" ] then DBINST="-dbinstance $OCF_RESKEY_DBINSTANCE " fi FORCE="" if ocf_is_true $OCF_RESKEY_AUTOMATIC_RECOVER then FORCE="-force" fi - output=`$SAPHOSTCTRL -function StartDatabase -dbname $SID -dbtype $DBTYPE $DBINST $FORCE -service` + DBOSUSER="" + if [ -n "$OCF_RESKEY_DBOSUSER" ] + then + DBOSUSER="-dbuser $OCF_RESKEY_DBOSUSER " + fi + output=`$SAPHOSTCTRL -function StartDatabase -dbname $SID -dbtype $DBTYPE $DBINST $DBOSUSER $FORCE -service` sapdatabase_monitor 1 rc=$? if [ $rc -eq 0 ] then ocf_log info "SAP database $SID started: $output" rc=$OCF_SUCCESS sapuserexit POST_START_USEREXIT "$OCF_RESKEY_POST_START_USEREXIT" else ocf_log err "SAP database $SID start failed: $output" rc=$OCF_ERR_GENERIC fi fi return $rc } # # sapdatabase_stop: Stop the SAP database # sapdatabase_stop() { check_saphostexec rc=$? if [ $rc -eq $OCF_SUCCESS ] then sapuserexit PRE_STOP_USEREXIT "$OCF_RESKEY_PRE_STOP_USEREXIT" DBINST="" if [ -n "$OCF_RESKEY_DBINSTANCE" ] then DBINST="-dbinstance $OCF_RESKEY_DBINSTANCE " fi - output=`$SAPHOSTCTRL -function StopDatabase -dbname $SID -dbtype $DBTYPE $DBINST -force -service` + DBOSUSER="" + if [ -n "$OCF_RESKEY_DBOSUSER" ] + then + DBOSUSER="-dbuser $OCF_RESKEY_DBOSUSER " + fi + output=`$SAPHOSTCTRL -function StopDatabase -dbname $SID -dbtype $DBTYPE $DBINST $DBOSUSER -force -service` if [ $? 
-eq 0 ] then ocf_log info "SAP database $SID stopped: $output" rc=$OCF_SUCCESS else ocf_log err "SAP database $SID stop failed: $output" rc=$OCF_ERR_GENERIC fi fi sapuserexit POST_STOP_USEREXIT "$OCF_RESKEY_POST_STOP_USEREXIT" return $rc } # # sapdatabase_monitor: Can the given database instance do anything useful? # sapdatabase_monitor() { strict=$1 rc=$OCF_SUCCESS if ! ocf_is_true $strict then sapdatabase_status rc=$? else check_saphostexec rc=$? if [ $rc -eq $OCF_SUCCESS ] then count=0 DBINST="" if [ -n "$OCF_RESKEY_DBINSTANCE" ] then DBINST="-dbinstance $OCF_RESKEY_DBINSTANCE " fi - output=`$SAPHOSTCTRL -function GetDatabaseStatus -dbname $SID -dbtype $DBTYPE $DBINST` + if [ -n "$OCF_RESKEY_DBOSUSER" ] + then + DBOSUSER="-dbuser $OCF_RESKEY_DBOSUSER " + fi + output=`$SAPHOSTCTRL -function GetDatabaseStatus -dbname $SID -dbtype $DBTYPE $DBINST $DBOSUSER` # we have to parse the output, because the returncode doesn't tell anything about the instance status for SERVICE in `echo "$output" | grep -i 'Component[ ]*Name *[:=] [A-Za-z][A-Za-z0-9_]* (' | sed 's/^.*Component[ ]*Name *[:=] *\([A-Za-z][A-Za-z0-9_]*\).*$/\1/i'` do - COLOR=`echo "$output" | grep -i "Component[ ]*Name *[:=] *$SERVICE (" | sed 's/^.*Status *[:=] *\([A-Za-z][A-Za-z0-9_]*\).*$/\1/i'` + COLOR=`echo "$output" | grep -i "Component[ ]*Name *[:=] *$SERVICE (" | sed 's/^.*Status *[:=] *\([A-Za-z][A-Za-z0-9_]*\).*$/\1/i' | uniq` STATE=0 case $COLOR in Running) STATE=$OCF_SUCCESS;; *) STATE=$OCF_NOT_RUNNING;; esac SEARCH=`echo "$OCF_RESKEY_MONITOR_SERVICES" | sed 's/\+/\\\+/g' | sed 's/\./\\\./g'` if [ `echo "$SERVICE" | egrep -c "$SEARCH"` -eq 1 ] then if [ $STATE -eq $OCF_NOT_RUNNING ] then ocf_log err "SAP database service $SERVICE is not running with status $COLOR !" rc=$STATE fi count=1 fi done if [ $count -eq 0 -a $rc -eq $OCF_SUCCESS ] then ocf_log err "The resource does not run any services which this RA could monitor!" 
rc=$OCF_ERR_ARGS fi if [ $rc -ne $OCF_SUCCESS ] then ocf_log err "The SAP database $SID is not running: $output" fi fi fi return $rc } # # sapdatabase_status: Are there any database processes on this host ? # sapdatabase_status() { + sid=`echo $SID | tr '[:upper:]' '[:lower:]'` + + SUSER=${OCF_RESKEY_DBOSUSER:-""} + case $DBTYPE in ADA) SEARCH="$SID/db/pgm/kernel" - SUSER=`grep "^SdbOwner" /etc/opt/sdb | awk -F'=' '{print $2}'` + [ -z "$SUSER" ] && SUSER=`grep "^SdbOwner" /etc/opt/sdb | awk -F'=' '{print $2}'` SNUM=2 ;; - ORA) SEARCH="ora_[a-z][a-z][a-z][a-z]_" - SUSER="ora`echo $SID | tr '[:upper:]' '[:lower:]'`" - SNUM=4 + ORA) DBINST=${OCF_RESKEY_DBINSTANCE} + DBINST=${OCF_RESKEY_DBINSTANCE:-${SID}} + SEARCH="ora_[a-z][a-z][a-z][a-z]_$DBINST" + + if [ -z "$SUSER" ]; then + id "oracle" > /dev/null 2> /dev/null && SUSER="oracle" + id "ora${sid}" > /dev/null 2> /dev/null && SUSER="${SUSER:+${SUSER},}ora${sid}" + fi + + SNUM=4 ;; DB6) SEARCH="db2[a-z][a-z][a-z]" - SUSER="db2`echo $SID | tr '[:upper:]' '[:lower:]'`" + [ -z "$SUSER" ] && SUSER="db2${sid}" SNUM=2 ;; SYB) SEARCH="dataserver" - SUSER="syb`echo $SID | tr '[:upper:]' '[:lower:]'`" + [ -z "$SUSER" ] && SUSER="syb${sid}" SNUM=1 ;; HDB) SEARCH="hdb[a-z]*server" - SUSER="`echo $SID | tr '[:upper:]' '[:lower:]'`adm" + [ -z "$SUSER" ] && SUSER="${sid}adm" SNUM=1 ;; esac - cnt=`ps -u $SUSER -o args 2> /dev/null | grep -c $SEARCH` + [ -z "$SUSER" ] && return $OCF_ERR_INSTALLED + + cnt=`ps -u $SUSER -o args 2> /dev/null | grep -v grep | grep -c $SEARCH` [ $cnt -ge $SNUM ] && return $OCF_SUCCESS return $OCF_NOT_RUNNING } # # sapdatabase_recover: # sapdatabase_recover() { OCF_RESKEY_AUTOMATIC_RECOVER=1 sapdatabase_stop sapdatabase_start } # # sapdatabase_validate: Check the symantic of the input parameters # sapdatabase_validate() { rc=$OCF_SUCCESS if [ `echo "$SID" | grep -c '^[A-Z][A-Z0-9][A-Z0-9]$'` -ne 1 ] then ocf_log err "Parsing parameter SID: '$SID' is not a valid system ID!" 
rc=$OCF_ERR_ARGS fi case "$DBTYPE" in ORA|ADA|DB6|SYB|HDB) ;; *) ocf_log err "Parsing parameter DBTYPE: '$DBTYPE' is not a supported database type!" rc=$OCF_ERR_ARGS ;; esac return $rc } # # sapdatabase_init: initialize global variables at the beginning # sapdatabase_init() { OCF_RESKEY_AUTOMATIC_RECOVER_default=0 : ${OCF_RESKEY_AUTOMATIC_RECOVER=${OCF_RESKEY_AUTOMATIC_RECOVER_default}} if [ -z "$OCF_RESKEY_MONITOR_SERVICES" ] then case $DBTYPE in ORA) export OCF_RESKEY_MONITOR_SERVICES="Instance|Database|Listener" ;; ADA) export OCF_RESKEY_MONITOR_SERVICES="Database" ;; DB6) db2sid="db2`echo $SID | tr '[:upper:]' '[:lower:]'`" export OCF_RESKEY_MONITOR_SERVICES="${SID}|${db2sid}" ;; SYB) export OCF_RESKEY_MONITOR_SERVICES="Server" ;; HDB) export OCF_RESKEY_MONITOR_SERVICES="hdbindexserver" ;; esac fi } diff --git a/heartbeat/sg_persist b/heartbeat/sg_persist index 1ce0a64c0..4d518ef0e 100755 --- a/heartbeat/sg_persist +++ b/heartbeat/sg_persist @@ -1,673 +1,674 @@ #!/bin/bash # # # OCF Resource Agent compliant PERSISTENT SCSI RESERVATION resource script. # # # Copyright (c) 2011 Evgeny Nifontov and lwang@suse.com All Rights Reserved. # # "Heartbeat drbd OCF Resource Agent: 2007, Lars Marowsky-Bree" was used # as example of multistate OCF Resource Agent. # # This program is free software; you can redistribute it and/or modify # it under the terms of version 2 of the GNU General Public License as # published by the Free Software Foundation. # # This program is distributed in the hope that it would be useful, but # WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. # # Further, this software is distributed without any warranty that it is # free of the rightful claim of any third person regarding infringement # or the like. Any license provided herein, whether implied or # otherwise, applies only to this software file. 
Patent licenses, if # any, provided herein do not apply to combinations of this program with # other software, or any other product whatsoever. # # You should have received a copy of the GNU General Public License # along with this program; if not, write the Free Software Foundation, # Inc., 59 Temple Place - Suite 330, Boston MA 02111-1307, USA. # # # OCF instance parameters # OCF_RESKEY_binary # OCF_RESKEY_devs # OCF_RESKEY_required_devs_nof # OCF_RESKEY_reservation_type # OCF_RESKEY_master_score_base # OCF_RESKEY_master_score_dev_factor # OCF_RESKEY_master_score_delay # # TODO # # 1) PROBLEM: devices which were not accessible during 'start' action, will be never registered/reserved # TODO: 'Master' and 'Salve' registers new devs in 'monitor' action # TODO: 'Master' reserves new devs in 'monitor' action ####################################################################### # Initialization: : ${OCF_FUNCTIONS_DIR=${OCF_ROOT}/lib/heartbeat} . ${OCF_FUNCTIONS_DIR}/ocf-shellfuncs # set default values : ${sg_persist_binary="sg_persist"} # binary name for the resource : ${devs=""} # device list : ${required_devs_nof=1} # number of required devices : ${reservation_type=1} # reservation type : ${master_score_base=0} # master score base : ${master_score_dev_factor=100} # device factor for master score : ${master_score_delay=30} # delay for master score ####################################################################### meta_data() { cat < 1.1 This resource agent manages SCSI PERSISTENT RESERVATIONS. "sg_persist" from sg3_utils is used, please see its documentation. Should be used as multistate (Master/Slave) resource Slave registers its node id ("crm_node -i") as reservation key ( --param-rk ) on each device in the "devs" list. Master reservs all devices from "devs" list with reservation "--prout-type" value from "reservation_type" parameter. Manages SCSI PERSISTENT RESERVATIONS The name of the binary that manages the resource. 
the binary name of the resource Device list. Multiple devices can be listed with blank space as separator. Shell wildcars are allowed. device list Minimum number of "working" devices from device list 1) existing 2) "sg_persist --read-keys \$device" works (Return code 0) resource actions "start","monitor","promote" and "validate-all" return "\$OCF_ERR_INSTALLED" if the actual number of "working" devices is less then "required_devs_nof". resource actions "stop" and "demote" tries to remove reservations and registration keys from all working devices, but always return "\$OCF_SUCCESS" minimum number of working devices reservation type reservation type master_score_base value "master_score_base" value is used in "master_score" calculation: master_score = \$master_score_base + \$master_score_dev_factor * \$working_devs if set to bigger value in sg_persist resource configuration on some node, this node will be "preferred" for master role. base master_score value Working device factor in master_score calculation each "working" device provides additional value to "master_score", so the node that sees more devices will be preferred for the "Master"-role Setting it to 0 will disable this behavior. working device factor in master_score calculation master/slave decreases/increases its master_score after delay of \$master_score_delay seconds so if some device gets inaccessible, the slave decreases its master_score first and the resource will no be watched and after this device reappears again the master increases its master_score first this can work only if the master_score_delay is bigger then monitor interval on both master and slave Setting it to 0 will disable this behavior. master_score decrease/increase delay time END exit $OCF_SUCCESS } sg_persist_init() { if ! ocf_is_root ; then ocf_log err "You must be root to perform this operation." 
exit $OCF_ERR_PERM fi SG_PERSIST=${OCF_RESKEY_binary:-"$sg_persist_binary"} check_binary $SG_PERSIST ROLE=$OCF_RESKEY_CRM_meta_role NOW=$(date +%s) RESOURCE="${OCF_RESOURCE_INSTANCE}" - MASTER_SCORE_VAR_NAME="master-${OCF_RESOURCE_INSTANCE}" + MASTER_SCORE_VAR_NAME="master-${OCF_RESOURCE_INSTANCE//:/-}" PENDING_VAR_NAME="pending-$MASTER_SCORE_VAR_NAME" #only works with corocync CRM_NODE="${HA_SBIN_DIR}/crm_node" NODE_ID_DEC=$($CRM_NODE -i) - NODE=$($CRM_NODE -l | $GREP $NODE_ID_DEC) + NODE=$($CRM_NODE -l | $GREP -w ^$NODE_ID_DEC) NODE=${NODE#$NODE_ID_DEC } NODE=${NODE% *} MASTER_SCORE_ATTRIBUTE="${HA_SBIN_DIR}/crm_attribute --lifetime=reboot --name=$MASTER_SCORE_VAR_NAME --node=$NODE" CRM_MASTER="${HA_SBIN_DIR}/crm_master --lifetime=reboot" PENDING_ATTRIBUTE="${HA_SBIN_DIR}/crm_attribute --lifetime=reboot --name=$PENDING_VAR_NAME --node=$NODE" NODE_ID_HEX=$(printf '0x%x' $NODE_ID_DEC) if [ -z "$NODE_ID_HEX" ]; then ocf_log err "Couldn't get node id with \"$CRM_NODE\"" exit $OCF_ERR_INSTALLED fi ocf_log debug "$RESOURCE: NODE:$NODE, ROLE:$ROLE, NODE_ID DEC:$NODE_ID_DEC HEX:$NODE_ID_HEX" DEVS=${OCF_RESKEY_devs:=$devs} REQUIRED_DEVS_NOF=${OCF_RESKEY_required_devs_nof:=$required_devs_nof} RESERVATION_TYPE=${OCF_RESKEY_reservation_type:=$reservation_type} MASTER_SCORE_BASE=${OCF_RESKEY_master_score_base:=$master_score_base} MASTER_SCORE_DEV_FACTOR=${OCF_RESKEY_master_score_dev_factor:=$master_score_dev_factor} MASTER_SCORE_DELAY=${OCF_RESKEY_master_score_delay:=$master_score_delay} ocf_log debug "$RESOURCE: DEVS=$DEVS" ocf_log debug "$RESOURCE: REQUIRED_DEVS_NOF=$REQUIRED_DEVS_NOF" ocf_log debug "$RESOURCE: RESERVATION_TYPE=$RESERVATION_TYPE" ocf_log debug "$RESOURCE: MASTER_SCORE_BASE=$MASTER_SCORE_BASE" ocf_log debug "$RESOURCE: MASTER_SCORE_DEV_FACTOR=$MASTER_SCORE_DEV_FACTOR" ocf_log debug "$RESOURCE: MASTER_SCORE_DELAY=$MASTER_SCORE_DELAY" #expand path wildcards DEVS=$(echo $DEVS) if [ -z "$DEVS" ]; then ocf_log err "\"devs\" not defined" exit $OCF_ERR_INSTALLED fi 
sg_persist_check_devs sg_persist_get_status } sg_persist_action_usage() { cat <&1` + [ $? -eq 0 ] || continue + + WORKING_DEVS+=($dev) + + echo "$READ_KEYS" | $GREP -qw $NODE_ID_HEX\$ + [ $? -eq 0 ] || continue + + REGISTERED_DEVS+=($dev) + + READ_RESERVATION=`$SG_PERSIST --in --read-reservation $dev 2>&1` + [ $? -eq 0 ] || continue + + echo "$READ_RESERVATION" | $GREP -qw $NODE_ID_HEX\$ if [ $? -eq 0 ]; then - WORKING_DEVS+=($dev) - echo $READ_KEYS | $GREP $NODE_ID_HEX >/dev/null - if [ $? -eq 0 ]; then - REGISTERED_DEVS+=($dev) - - READ_RESERVATION=`$SG_PERSIST --in --read-reservation $dev 2>&1` - if [ $? -eq 0 ]; then - echo $READ_RESERVATION | $GREP $NODE_ID_HEX >/dev/null - if [ $? -eq 0 ]; then - RESERVED_DEVS+=($dev) - fi + RESERVED_DEVS+=($dev) + fi - reservation_key=`echo $READ_RESERVATION | $GREP -o 'Key=0x[0-9a-f]*' | $GREP -o '0x[0-9a-f]*'` - if [ -n "$reservation_key" ]; then - DEVS_WITH_RESERVATION+=($dev) - RESERVATION_KEYS+=($reservation_key) - fi - fi - fi + reservation_key=`echo $READ_RESERVATION | $GREP -o 'Key=0x[0-9a-f]*' | $GREP -o '0x[0-9a-f]*'` + if [ -n "$reservation_key" ]; then + DEVS_WITH_RESERVATION+=($dev) + RESERVATION_KEYS+=($reservation_key) fi done WORKING_DEVS_NOF=${#WORKING_DEVS[*]} ocf_log debug "$RESOURCE: working devices: `sg_persist_echo_array ${WORKING_DEVS[*]}`" ocf_log debug "$RESOURCE: number of working devices: $WORKING_DEVS_NOF" ocf_log debug "$RESOURCE: registered devices: `sg_persist_echo_array ${REGISTERED_DEVS[*]}`" ocf_log debug "$RESOURCE: reserved devices: `sg_persist_echo_array ${RESERVED_DEVS[*]}`" ocf_log debug "$RESOURCE: devices with reservation: `sg_persist_echo_array ${DEVS_WITH_RESERVATION[*]}`" ocf_log debug "$RESOURCE: reservation keys: `sg_persist_echo_array ${RESERVATION_KEYS[*]}`" MASTER_SCORE=$(($MASTER_SCORE_BASE + $MASTER_SCORE_DEV_FACTOR*$WORKING_DEVS_NOF)) ocf_log debug "$RESOURCE: master_score: $MASTER_SCORE_BASE + $MASTER_SCORE_DEV_FACTOR*$WORKING_DEVS_NOF = $MASTER_SCORE" } 
sg_persist_check_devs() { for dev in $DEVS do if [ -e "$dev" ]; then EXISTING_DEVS+=($dev) fi done EXISTING_DEVS_NOF=${#EXISTING_DEVS[*]} if [ $EXISTING_DEVS_NOF -lt $REQUIRED_DEVS_NOF ]; then ocf_log err "Number of existing devices=$EXISTING_DEVS_NOF less then required_devs_nof=$REQUIRED_DEVS_NOF" exit $OCF_ERR_INSTALLED fi } sg_persist_is_registered() { for registered_dev in ${REGISTERED_DEVS[*]} do if [ "$registered_dev" == "$1" ]; then return 0 fi done return 1 } sg_persist_get_reservation_key() { for array_index in ${!DEVS_WITH_RESERVATION[*]} do if [ "${DEVS_WITH_RESERVATION[$array_index]}" == "$1" ]; then echo ${RESERVATION_KEYS[$array_index]} return 0 fi done echo "" } sg_persist_echo_array() { str_count=0 arr_str="" for str in "$@" do arr_str="$arr_str[$str_count]:$str " str_count=$(($str_count+1)) done echo $arr_str } sg_persist_parse_act_pending() { ACT_PENDING_TS=0 ACT_PENDING_SCORE=0 if [ -n "$ACT_PENDING" ]; then ACT_PENDING_TS=${ACT_PENDING%%_*} ACT_PENDING_SCORE=${ACT_PENDING##*_} fi } sg_persist_clear_pending() { if [ -n "$ACT_PENDING" ]; then DO_PENDING_UPDATE="YES" NEW_PENDING="" fi } sg_persist_new_master_score() { DO_MASTER_SCORE_UPDATE="YES" NEW_MASTER_SCORE=$1 } sg_persist_new_pending() { DO_PENDING_UPDATE="YES" NEW_PENDING=$1 } # Functions invoked by resource manager actions sg_persist_action_start() { ocf_run $MASTER_SCORE_ATTRIBUTE --update=$MASTER_SCORE ocf_run $PENDING_ATTRIBUTE --update="" if [ $WORKING_DEVS_NOF -lt $REQUIRED_DEVS_NOF ]; then ocf_log err "$RESOURCE: Number of working devices=$WORKING_DEVS_NOF less then required_devs_nof=$REQUIRED_DEVS_NOF" exit $OCF_ERR_GENERIC fi for dev in ${WORKING_DEVS[*]} do if sg_persist_is_registered $dev ; then : OK else - ocf_run $SG_PERSIST --out --register --param-rk=0 --param-sark=$NODE_ID_HEX $dev + ocf_run $SG_PERSIST --out --no-inquiry --register --param-rk=0 --param-sark=$NODE_ID_HEX $dev if [ $? 
-ne $OCF_SUCCESS ] then return $OCF_ERR_GENERIC fi fi done return $OCF_SUCCESS } sg_persist_action_stop() { if [ ${#REGISTERED_DEVS[*]} -eq 0 ]; then ocf_log debug "$RESOURCE stop: already no registrations" else # Clear preference for becoming master ocf_run $MASTER_SCORE_ATTRIBUTE --delete ocf_run $PENDING_ATTRIBUTE --delete for dev in ${REGISTERED_DEVS[*]} do - ocf_run $SG_PERSIST --out --register --param-rk=$NODE_ID_HEX --param-sark=0 $dev + ocf_run $SG_PERSIST --out --no-inquiry --register --param-rk=$NODE_ID_HEX --param-sark=0 $dev done fi return $OCF_SUCCESS } sg_persist_action_monitor() { ACT_MASTER_SCORE=`$MASTER_SCORE_ATTRIBUTE --query --quiet 2>&1` ocf_log debug "$RESOURCE monitor: ACT_MASTER_SCORE=$ACT_MASTER_SCORE" ACT_PENDING=`$PENDING_ATTRIBUTE --query --quiet 2>&1` ocf_log debug "$RESOURCE monitor: ACT_PENDING=$ACT_PENDING" sg_persist_parse_act_pending ocf_log debug "$RESOURCE monitor: ACT_PENDING_TS=$ACT_PENDING_TS" ocf_log debug "$RESOURCE monitor: ACT_PENDING_VAL=$ACT_PENDING_SCORE" ocf_log debug "$MASTER_SCORE, $ACT_MASTER_SCORE, $ROLE" DO_MASTER_SCORE_UPDATE="NO" DO_PENDING_UPDATE="NO" if [ -n "$ACT_MASTER_SCORE" ] then if [ $ACT_MASTER_SCORE -eq $MASTER_SCORE ]; then sg_persist_clear_pending else case $ROLE in Master) if [ $MASTER_SCORE -lt $ACT_MASTER_SCORE ]; then if [ -n "$ACT_PENDING" ] then if [ $(($NOW-$ACT_PENDING_TS-$MASTER_SCORE_DELAY)) -ge 0 ]; then sg_persist_new_master_score $MASTER_SCORE sg_persist_clear_pending fi else if [ $MASTER_SCORE_DELAY -eq 0 ]; then sg_persist_new_master_score $MASTER_SCORE sg_persist_clear_pending else sg_persist_new_pending "${NOW}_${MASTER_SCORE}" fi fi else sg_persist_new_master_score $MASTER_SCORE sg_persist_clear_pending fi ;; Slave) if [ $MASTER_SCORE -gt $ACT_MASTER_SCORE ]; then if [ -n "$ACT_PENDING" ]; then if [ $(($NOW-$ACT_PENDING_TS-$MASTER_SCORE_DELAY)) -ge 0 ]; then sg_persist_new_master_score $MASTER_SCORE sg_persist_clear_pending fi else if [ $MASTER_SCORE_DELAY -eq 0 ]; then 
sg_persist_new_master_score $MASTER_SCORE sg_persist_clear_pending else sg_persist_new_pending "${NOW}_${MASTER_SCORE}" fi fi else sg_persist_new_master_score $MASTER_SCORE sg_persist_clear_pending fi ;; *) ;; esac fi fi if [ $DO_MASTER_SCORE_UPDATE == "YES" ]; then ocf_run $MASTER_SCORE_ATTRIBUTE --update=$NEW_MASTER_SCORE fi if [ $DO_PENDING_UPDATE == "YES" ]; then ocf_run $PENDING_ATTRIBUTE --update=$NEW_PENDING fi if [ ${#REGISTERED_DEVS[*]} -eq 0 ]; then ocf_log debug "$RESOURCE monitor: no registrations" return $OCF_NOT_RUNNING fi if [ ${#RESERVED_DEVS[*]} -eq ${#WORKING_DEVS[*]} ]; then return $OCF_RUNNING_MASTER fi if [ ${#REGISTERED_DEVS[*]} -eq ${#WORKING_DEVS[*]} ]; then if [ $RESERVATION_TYPE -eq 7 ] || [ $RESERVATION_TYPE -eq 8 ]; then if [ ${#DEVS_WITH_RESERVATION[*]} -gt 0 ]; then return $OCF_RUNNING_MASTER else return $OCF_SUCCESS fi else return $OCF_SUCCESS fi fi ocf_log err "$RESOURCE monitor: unexpected state" return $OCF_ERR_GENERIC } sg_persist_action_promote() { if [ ${#RESERVED_DEVS[*]} -gt 0 ]; then ocf_log info "$RESOURCE promote: already master" return $OCF_SUCCESS fi for dev in ${WORKING_DEVS[*]} do reservation_key=`sg_persist_get_reservation_key $dev` case $RESERVATION_TYPE in 1|3|5|6) if [ -z "$reservation_key" ]; then - ocf_run $SG_PERSIST --out --reserve --param-rk=$NODE_ID_HEX --prout-type=$RESERVATION_TYPE $dev + ocf_run $SG_PERSIST --out --no-inquiry --reserve --param-rk=$NODE_ID_HEX --prout-type=$RESERVATION_TYPE $dev if [ $? -ne $OCF_SUCCESS ]; then return $OCF_ERR_GENERIC fi else - ocf_run $SG_PERSIST --out --preempt --param-sark=$reservation_key --param-rk=$NODE_ID_HEX --prout-type=$RESERVATION_TYPE $dev + ocf_run $SG_PERSIST --out --no-inquiry --preempt --param-sark=$reservation_key --param-rk=$NODE_ID_HEX --prout-type=$RESERVATION_TYPE $dev if [ $? 
-ne $OCF_SUCCESS ]; then return $OCF_ERR_GENERIC fi fi ;; 7|8) if [ -z "$reservation_key" ]; then - ocf_run $SG_PERSIST --out --reserve --param-rk=$NODE_ID_HEX --prout-type=$RESERVATION_TYPE $dev + ocf_run $SG_PERSIST --out --no-inquiry --reserve --param-rk=$NODE_ID_HEX --prout-type=$RESERVATION_TYPE $dev if [ $? -ne $OCF_SUCCESS ] then return $OCF_ERR_GENERIC fi else ocf_log info "$RESOURCE promote: there already exist an reservation holder, all registrants become reservation holders" return $OCF_SUCCESS fi ;; *) return $OCF_ERR_ARGS ;; esac done return $OCF_SUCCESS } sg_persist_action_demote() { case $RESERVATION_TYPE in 1|3|5|6) if [ ${#RESERVED_DEVS[*]} -eq 0 ]; then ocf_log info "$RESOURCE demote: already slave" return $OCF_SUCCESS fi for dev in ${RESERVED_DEVS[*]} do - ocf_run $SG_PERSIST --out --release --param-rk=$NODE_ID_HEX --prout-type=$RESERVATION_TYPE $dev + ocf_run $SG_PERSIST --out --no-inquiry --release --param-rk=$NODE_ID_HEX --prout-type=$RESERVATION_TYPE $dev if [ $? -ne $OCF_SUCCESS ]; then return $OCF_ERR_GENERIC fi done ;; 7|8) #in case of 7/8, --release won't release the reservation unless unregister the key. if [ ${#REGISTERED_DEVS[*]} -eq 0 ]; then ocf_log info "$RESOURCE demote: already slave" return $OCF_SUCCESS fi for dev in ${REGISTERED_DEVS[*]} do - ocf_run $SG_PERSIST --out --register --param-rk=$NODE_ID_HEX --param-sark=0 $dev + ocf_run $SG_PERSIST --out --no-inquiry --register --param-rk=$NODE_ID_HEX --param-sark=0 $dev if [ $? 
-ne $OCF_SUCCESS ]; then return $OCF_ERR_GENERIC fi done ;; *) return $OCF_ERR_ARGS ;; esac return $OCF_SUCCESS } sg_persist_action_notify() { local n_type="$OCF_RESKEY_CRM_meta_notify_type" local n_op="$OCF_RESKEY_CRM_meta_notify_operation" set -- $OCF_RESKEY_CRM_meta_notify_active_resource local n_active="$#" set -- $OCF_RESKEY_CRM_meta_notify_stop_resource local n_stop="$#" set -- $OCF_RESKEY_CRM_meta_notify_start_resource local n_start="$#" ocf_log debug "$RESOURCE notify: $n_type for $n_op - counts: active $n_active - starting $n_start - stopping $n_stop" return $OCF_SUCCESS } sg_persist_action_validate_all () { if [ "$OCF_RESKEY_CRM_meta_master_max" != "1" ] && [ "$RESERVATION_TYPE" != "7" ] && [ "$RESERVATION_TYPE" != "8" ]; then ocf_log err "Master options misconfigured." exit $OCF_ERR_CONFIGURED fi return $OCF_SUCCESS } if [ $# -ne 1 ]; then echo "Incorrect parameter count." sg_persist_action_usage exit $OCF_ERR_ARGS fi ACTION=$1 case $ACTION in meta-data) meta_data ;; validate-all) sg_persist_init sg_persist_action_validate_all ;; start|promote|monitor|stop|demote) ocf_log debug "$RESOURCE: starting action \"$ACTION\"" sg_persist_init sg_persist_action_$ACTION exit $? ;; notify) sg_persist_action_notify exit $? ;; usage|help) sg_persist_action_usage exit $OCF_SUCCESS ;; *) sg_persist_action_usage exit $OCF_ERR_ARGS ;; esac diff --git a/heartbeat/shellfuncs.in b/heartbeat/shellfuncs.in index 7786ec3d1..999162012 100644 --- a/heartbeat/shellfuncs.in +++ b/heartbeat/shellfuncs.in @@ -1,96 +1,96 @@ # Author: Alan Robertson # Support: linux-ha-dev@lists.tummy.com # License: GNU Lesser General Public License (LGPL) # # Set these variables if they're not already set... 
# : ${HA_SBIN_DIR:=@sbindir@} : ${HA_NOARCHBIN:=@datadir@/heartbeat} : ${OCF_AGENTS:=@OCF_RA_DIR@/heartbeat/} export HA_DIR HA_RCDIR HA_FIFO HA_BIN export HA_DEBUGLOG HA_LOGFILE HA_LOGFACILITY export HA_DATEFMT HA_RESOURCEDIR HA_DOCDIR export OCF_AGENTS PATH=$HA_BIN:${HA_SBIN_DIR}:${HA_NOARCHBIN}:$PATH PATH=`echo $PATH | sed -e 's%::%%' -e 's%:\.:%:%' -e 's%^:%%' -e 's%^\.:%%'` export PATH # A suitable echo command Echo() { echo "$@" } # copy stdin (text) to FIFO, with surrounding ">>>" and "<<<" marker lines. # no args.; no result # Notes: # o Using "cat -" rather than "cat" simply for clarity. # o The trailing "| cat -" tries to hold things together as a single # write (which is probably preferable behaviour in this context). ha_clustermsg() { (echo ">>>"; cat -; echo "<<<") | cat - >> $HA_FIFO } ha_parameter() { VALUE=`sed -e 's%[ ][ ]*% %' -e 's%^ %%' -e 's%#.*%%' $HA_CF | grep -i "^$1 " | sed 's%[^ ]* %%'` if [ "X$VALUE" = X ] then case $1 in keepalive) VALUE=2;; deadtime) ka=`ha_parameter keepalive` VALUE=`expr $ka '*' 2 '+' 1`;; esac fi Echo $VALUE } BSD_Status() { local base=${1##*/} local pid ret_status=`/bin/ps -ao pid,command | grep $base | sed 's/ .*//'` if [ "$ret_status" != "" ] then echo "${base} is running..." return 0 fi if [ -f $HA_VARRUN/${base}.pid ] then echo "${base} dead but pid file exists" return 1 fi if [ -f /var/run/${base}.pid ] then echo "${base} dead but pid file exists" return 1 fi if - [ -f $HA_VARLOCK/var/lock/subsys/${base}.pid ] + [ -f $HA_VARLOCK/${base}.pid ] then echo "${base} dead but lock file exists" return 2 fi if [ -f /var/spool/lock/${base} ] then echo "${base} dead but lock file exists" return 2 fi } # Now get the good stuff . 
@OCF_LIB_DIR@/heartbeat/ocf-shellfuncs diff --git a/heartbeat/symlink b/heartbeat/symlink index 1e36a9c74..dbf633efa 100755 --- a/heartbeat/symlink +++ b/heartbeat/symlink @@ -1,245 +1,245 @@ #!/bin/sh # # # An OCF RA that manages a symlink # # Copyright (c) 2011 Dominik Klein # # This program is free software; you can redistribute it and/or modify # it under the terms of version 2 of the GNU General Public License as # published by the Free Software Foundation. # # This program is distributed in the hope that it would be useful, but # WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. # # Further, this software is distributed without any warranty that it is # free of the rightful claim of any third person regarding infringement # or the like. Any license provided herein, whether implied or # otherwise, applies only to this software file. Patent licenses, if # any, provided herein do not apply to combinations of this program with # other software, or any other product whatsoever. # # You should have received a copy of the GNU General Public License # along with this program; if not, write the Free Software Foundation, # Inc., 59 Temple Place - Suite 330, Boston MA 02111-1307, USA. # ####################################################################### # Initialization: : ${OCF_FUNCTIONS_DIR=${OCF_ROOT}/lib/heartbeat} . ${OCF_FUNCTIONS_DIR}/ocf-shellfuncs ####################################################################### meta_data() { cat < 1.1 This resource agent that manages a symbolic link (symlink). It is primarily intended to manage configuration files which should be enabled or disabled based on where the resource is running, such as cron job definitions and the like. Manages a symbolic link Full path of the symbolic link to be managed. This must obviously be in a filesystem that supports symbolic links. 
Full path of the symlink Full path to the link target (the file or directory which the symlink points to). Full path to the link target A suffix to append to any files that the resource agent moves out of the way because they clash with "link". If this is unset (the default), then the resource agent will simply refuse to create a symlink if it clashes with an existing file. Suffix to append to backup files END } symlink_monitor() { # This applies the following logic: # # * If $OCF_RESKEY_link does not exist, then the resource is # definitely stopped. # # * If $OCF_RESKEY_link exists and is a symlink that points to # ${OCF_RESKEY_target}, then the resource is definitely started. # # * If $OCF_RESKEY_link exists, but is anything other than a # symlink to ${OCF_RESKEY_target}, then the status depends on whether # ${OCF_RESKEY_backup_suffix} is set: # # - if ${OCF_RESKEY_backup_suffix} is set, then the resource is # simply not running. The existing file will be moved out of # the way, to ${OCF_RESKEY_link}${OCF_RESKEY_backup_suffix}, # when the resource starts. # # - if ${OCF_RESKEY_backup_suffix} is not set, then an existing # file ${OCF_RESKEY_link} is an error condition, and the # resource can't start here. rc=$OCF_ERR_GENERIC # Using ls here instead of "test -e", as "test -e" returns false # if the file does exist, but it a symlink to a file that doesn't if ! ls "$OCF_RESKEY_link" >/dev/null 2>&1; then ocf_log debug "$OCF_RESKEY_link does not exist" rc=$OCF_NOT_RUNNING elif [ ! -L "$OCF_RESKEY_link" ]; then if [ -z "$OCF_RESKEY_backup_suffix" ]; then ocf_exit_reason "$OCF_RESKEY_link exists but is not a symbolic link!" 
exit $OCF_ERR_INSTALLED else ocf_log debug "$OCF_RESKEY_link exists but is not a symbolic link, will be moved to ${OCF_RESKEY_link}${OCF_RESKEY_backup_suffix} on start" rc=$OCF_NOT_RUNNING fi - elif readlink -f "$OCF_RESKEY_link" | egrep -q "^${OCF_RESKEY_target}$"; then + elif readlink -m "$OCF_RESKEY_link" | egrep -q "^${OCF_RESKEY_target}$"; then ocf_log debug "$OCF_RESKEY_link exists and is a symbolic link to ${OCF_RESKEY_target}." rc=$OCF_SUCCESS else if [ -z "$OCF_RESKEY_backup_suffix" ]; then ocf_exit_reason "$OCF_RESKEY_link does not point to ${OCF_RESKEY_target}!" exit $OCF_ERR_INSTALLED else ocf_log debug "$OCF_RESKEY_link does not point to ${OCF_RESKEY_target}, will be moved to ${OCF_RESKEY_link}${OCF_RESKEY_backup_suffix} on start" rc=$OCF_NOT_RUNNING fi fi return $rc } symlink_start() { if ! symlink_monitor; then if [ -e "$OCF_RESKEY_link" ]; then if [ -z "$OCF_RESKEY_backup_suffix" ]; then # Shouldn't happen, because symlink_monitor should # have errored out. But there is a chance that # something else put that file there after # symlink_monitor ran. ocf_exit_reason "$OCF_RESKEY_link exists and no backup_suffix is set, won't overwrite." exit $OCF_ERR_GENERIC else ocf_log debug "Found $OCF_RESKEY_link, moving to ${OCF_RESKEY_link}${OCF_RESKEY_backup_suffix}" ocf_run mv -v "$OCF_RESKEY_link" "${OCF_RESKEY_link}${OCF_RESKEY_backup_suffix}" \ || exit $OCF_ERR_GENERIC fi fi ocf_run ln -sv "$OCF_RESKEY_target" "$OCF_RESKEY_link" symlink_monitor return $? else return $OCF_SUCCESS fi } symlink_stop() { if symlink_monitor; then ocf_run rm -vf "$OCF_RESKEY_link" || exit $OCF_ERR_GENERIC if ! 
symlink_monitor; then if [ -e "${OCF_RESKEY_link}${OCF_RESKEY_backup_suffix}" ]; then ocf_log debug "Found backup ${OCF_RESKEY_link}${OCF_RESKEY_backup_suffix}, moving to $OCF_RESKEY_link" # if restoring the backup fails then still return with # $OCF_SUCCESS, but log a warning ocf_run -warn mv "${OCF_RESKEY_link}${OCF_RESKEY_backup_suffix}" "$OCF_RESKEY_link" fi return $OCF_SUCCESS else ocf_exit_reason "Removing $OCF_RESKEY_link failed." return $OCF_ERR_GENERIC fi else return $OCF_SUCCESS fi } symlink_validate_all() { if [ "x${OCF_RESKEY_link}" = "x" ]; then ocf_exit_reason "Mandatory parameter link is unset" exit $OCF_ERR_CONFIGURED fi if [ "x${OCF_RESKEY_target}" = "x" ]; then ocf_exit_reason "Mandatory parameter target is unset" exit $OCF_ERR_CONFIGURED fi # Having a non-existant target is technically not an error, as # symlinks are allowed to point to non-existant paths. But it # still doesn't hurt to warn people if the target does not exist # (but only during non-probes). if [ ! -e "${OCF_RESKEY_target}" ]; then ocf_log warn "${OCF_RESKEY_target} does not exist!" fi } symlink_usage() { cat <192.168.6.7:80 gate # real=192.168.6.32:80 gate # fallback=127.0.0.1:80 gate # service=http # scheduler=rr # #persistent=600 # #netmask=255.255.255.255 # protocol=tcp # checktype=negotiate # checkport=80 # request="index.html" # receive="Test Page" # virtualhost=x.y.z #Sample configuration for an https virtual service. #Fallback setting overrides global #virtual=192.168.6.240:443 # real=192.168.16.3:443 masq # real=192.168.16.5:443 masq # fallback=127.0.0.1:443 # service=https # scheduler=rr # #persistent=600 # #netmask=255.255.255.255 # protocol=tcp # checktype=negotiate # checkport=443 # request="index.html" # receive="Test Page" # virtualhost=x.y.z #Sample configuration for an ftp virtual service. 
#Fallback setting overrides global #virtual=192.168.6.240:21 # real=192.168.16.3:21 masq # real=192.168.16.5:21 masq # fallback=127.0.0.1:21 # service=ftp # checkport=21 # scheduler=wlc # scheduler=rr # #persistent=600 # #netmask=255.255.255.255 # protocol=tcp # checktype=negotiate # login="anonymous" # passwd="ldirectord@localhost" # request="welcome.msg" # receive="test" #Sample configuration for an smtp virtual service. #Fallback setting overrides global #virtual=192.168.6.240:25 # real=192.168.16.3:25 masq # real=192.168.16.5:25 masq # fallback=127.0.0.1:25 # service=smtp # scheduler=wlc # protocol=tcp # persistent=600 # #netmask=255.255.255.255 # checktype=negotiate # checkport=25 #Sample configuration for an submission virtual service. #Fallback setting overrides global #virtual=192.168.6.240:587 # real=192.168.16.3:587 masq # real=192.168.16.5:587 masq # fallback=127.0.0.1:587 # service=submission # scheduler=wlc # protocol=tcp # persistent=600 # #netmask=255.255.255.255 # checktype=negotiate # checkport=587 #Sample configuration for a pop virtual service. #Fallback setting overrides global #virtual=192.168.6.240:110 # real=192.168.16.3:110 masq # real=192.168.16.5:110 masq # fallback=127.0.0.1:110 # service=pop # scheduler=wlc # persistent=600 # #netmask=255.255.255.255 # protocol=tcp # checktype=negotiate # checkport=110 # #login="test" # #passwd="test" ##Sample configuration for an imap virtual service. #Fallback setting overrides global #virtual=192.168.6.240:143 # real=127.0.0.1:143 masq # real=192.168.16.3:143 masq # real=192.168.16.5:143 masq # fallback=127.0.0.1:143 # service=imap # scheduler=wlc # #persistent=600 # #netmask=255.255.255.255 # protocol=tcp # checktype=negotiate # checkport=143 # #login="test" # #passwd="test" #Sample configuration for an ldap virtual service. 
#Fallback setting overrides global #virtual=192.168.84.5:389 # real=10.0.1.4:389 masq # real=10.0.1.6:389 masq # fallback=127.0.0.1:389 # service=ldap # scheduler=wlc # #persistent=600 # #netmask=255.255.255.255 # protocol=tcp # checktype=negotiate # checkport=369 # request="dc=upmc, dc=fr" # receive="dc=upmc, dc=fr" # #login="test" # #passwd="test" #Sample configuration for an nntp virtual service. #Fallback setting overrides global #virtual=192.168.84.5:119 # real=10.0.1.4:119 masq # real=10.0.1.6:119 masq # fallback=127.0.0.1:119 # service=nntp # scheduler=wlc # #persistent=600 # #netmask=255.255.255.255 # protocol=tcp # checktype=negotiate # checkport=119 #Sample configuration for a UDP DNS virtual service. #Fallback setting overrides global #virtual=192.168.84.5:53 # real=10.0.1.4:53 masq # real=10.0.1.6:53 masq # fallback=127.0.0.1:53 # service=dns # scheduler=wlc # #persistent=600 # #netmask=255.255.255.255 # protocol=udp # checktype=negotiate # checkport=53 # request="x.y.z" # receive="127.0.0.1" #Sample configuration for a MySQL virtual service. #virtual = 192.168.10.74:3306 # real=sql01->sql03:3306 gate 10 # fallback=127.0.0.1:3306 # service=mysql # scheduler=wrr # #persistent=600 # #netmask=255.255.255.255 # protocol=tcp # checktype=negotiate # login="readuser" # passwd="genericpassword" # database="portal" # request="SELECT * FROM link" #Sample configuration for a PostgreSQL virtual service. #virtual = 192.168.10.74:5432 # real=sql01->sql03:5432 gate 10 # fallback=127.0.0.1:5432 # service=pgsql # scheduler=wrr # #persistent=600 # #netmask=255.255.255.255 # protocol=tcp # checktype=negotiate # login="readuser" # passwd="genericpassword" # database="portal" # request="SELECT * FROM link" #Sample configuration for a Oracle virtual service. 
#virtual = 192.168.10.74:1521 # real=sql01->sql03:1521 gate 10 # fallback=127.0.0.1:1521 # service=oracle # scheduler=wrr # #persistent=600 # #netmask=255.255.255.255 # protocol=tcp # checktype=negotiate # login="readuser" # passwd="genericpassword" # database="portal" # request="SELECT * FROM link" #Sample configuration for an unsuported protocol #The real servers will just be brought up without checking for availability #virtual=192.168.6.240:23 # real=192.168.16.3:23 masq # real=192.168.16.5:23 masq # fallback=127.0.0.1:23 # service=none # scheduler=wlc # persistent=600 # #netmask=255.255.255.255 # protocol=tcp # checktype=on # A sample virtual services that uses a ping check. # Note that using checktype=connect and protocol=udp # will also effect ping checks #virtual=192.168.6.240:53 # real=192.168.6.2:53 gate # real=192.168.6.3:53 gate # real=192.168.6.6:53 gate # fallback=127.0.0.1:53 gate # scheduler=rr # #persistent=600 # #netmask=255.255.255.255 # protocol=udp # checktype=ping # failurecount=3 # A sample virtual services that uses a Radius check on UDP. # Note that using checktype=connect and protocol=udp # will also effect ping checks #virtual=192.168.6.240:1812 # real=192.168.6.2:1812 gate # real=192.168.6.3:1812 gate # real=192.168.6.6:1812 gate # fallback=127.0.0.1:1812 gate # scheduler=rr # #persistent=600 # #netmask=255.255.255.255 # protocol=udp # checktype=negotiate # service=radius # login="readuser" # passwd="genericpassword" # secret="somesecret" # checktimeout=1 # A sample virtual services that uses a SIP check on UDP. # Note that using checktype=connect and protocol=udp # will also effect ping checks #virtual=192.168.6.240:5060 # real=192.168.6.2::5060 gate # real=192.168.6.3::5060 gate # real=192.168.6.6::5060 gate # fallback=127.0.0.1:5060 gate # scheduler=rr # #persistent=600 # #netmask=255.255.255.255 # protocol=udp # checktype=negotiate # service=sip # checktimeout=1 #Sample configuration for an nntp virtual service with IPv6. 
#Fallback setting overrides global #virtual6=[2001:db8::5]:119 # real6=[2001:db8:0:1::4]:119 masq # real6=[2001:db8:0:1::6]:119 masq # fallback6=[::1]:119 # service=nntp # scheduler=wlc # #persistent=600 # #netmask=255.255.255.255 # protocol=tcp # checktype=negotiate # checkport=119 diff --git a/ldirectord/ldirectord.in b/ldirectord/ldirectord.in old mode 100644 new mode 100755 index 44b7d6b85..628f1c3e3 --- a/ldirectord/ldirectord.in +++ b/ldirectord/ldirectord.in @@ -1,5297 +1,5371 @@ #!/usr/bin/perl -w ###################################################################### # ldirectord http://www.vergenet.net/linux/ldirectord/ # Linux Director Daemon - run "perldoc ldirectord" for details # # 1999-2006 (C) Jacob Rief , # Horms and others # # License: GNU General Public License (GPL) # # Note: * The original author of this software was Jacob Rief circa 1999 # * It was maintained by Jacob Rief and Horms # from November 1999 to July 2003. # * From July 2003 Horms is the maintainer # # This program is free software; you can redistribute it and/or # modify it under the terms of the GNU General Public License as # published by the Free Software Foundation; either version 2 of the # License, or (at your option) any later version. # # This program is distributed in the hope that it will be useful, but # WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU # General Public License for more details. 
# # You should have received a copy of the GNU General Public License # along with this program; if not, write to the Free Software # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA # 02111-1307 USA # ###################################################################### # A Brief history of versions: # # From oldest to newest # 1.1-1.144: ldirecord maintained in CVS HEAD branch # 1.145-1.186: ldirectord.in maintained in CVS HEAD BRANCH # 1.186-ha-VERSION: ldirectord.in maintained in mercurial =head1 NAME ldirectord - Linux Director Daemon Daemon to monitor remote services and control Linux Virtual Server =head1 SYNOPSIS B [B<-d|--debug>] [--] [I] B | B | B | B | B | B | B B [B<-h|-?|--help|-v|--version>] =head1 DESCRIPTION B is a daemon to monitor and administer real servers in a cluster of load balanced virtual servers. B typically is started from heartbeat but can also be run from the command line. On startup B reads the file B<@sysconfdir@/ha.d/conf/>I. After parsing the file, entries for virtual servers are created on the LVS. Now at regular intervals the specified real servers are monitored and if they are considered alive, added to a list for each virtual server. If a real server fails, it is removed from that list. Only one instance of B can be started for each configuration, but more instances of B may be started for different configurations. This helps to group clusters of services. Normally one would put an entry inside B<@sysconfdir@/ha.d/haresources> I to start ldirectord from heartbeat. =head1 OPTIONS I: This is the name for the configuration as specified in the file B<@sysconfdir@/ha.d/conf/>I B<-d|--debug> Don't start as daemon and log verbosely. B<-h|--help> Print user manual and exit. B<-v|--version> Print version and exit. B the daemon for the specified configuration. B the daemon for the specified configuration. This is the same as sending a TERM signal to the running daemon. B the daemon for the specified configuration. 
The same as stopping and starting. B the configuration file. This is only useful for modifications inside a virtual server entry. It will have no effect on adding or removing a virtual server block. This is the same as sending a HUP signal to the running daemon. B of the running daemon for the specified configuration. =head1 SYNTAX =head2 Description of how to write configuration files BI<(ip_address|hostname:portnumber|servicename)|firewall-mark> Defines a virtual service by IP-address (or hostname) and port (or servicename) or firewall-mark. A firewall-mark is an integer greater than zero. The configuration of marking packets is controlled using the C<-m> option to B(8). All real services and flags for a virtual service must follow this line immediately and be indented. BI Timeout in seconds for connect, external, external-perl and ping checks. If the timeout is exceeded then the real server is declared dead. If defined in a virtual server section then the global value is overridden. If undefined then the value of negotiatetimeout is used. negotiatetimeout is also a global value that may be overridden by a per-virtual setting. If both checktimeout and negotiatetimeout are unset, the default is used. Default: 5 seconds BI Timeout in seconds for negotiate checks. If defined in a virtual server section then the global value is overridden. If undefined then the value of checktimeout is used. checktimeout is also a global value that may be overridden by a per-virtual setting. If both negotiatetimeout and checktimeout are unset, the default is used. Default: 30 seconds BI Defines the number of second between server checks. When fork=no this option defines the amount of time ldirectord sleeps between running all of the realserver checks in all virtual service pools. When fork=yes this option defines the amount of time each forked child sleeps per virtual service pool after running all realserver checks for that pool. 
If set in the virtual server section then the global value is overridden, but ONLY if using forking mode (BI). Default: 10 seconds BI This option is deprecated and slated for removal in a future version. Please see the 'failurecount' option. The number of times a check will be attempted before it is considered to have failed. Only works with ping checks. Note that the checktimeout/negotiatetimeout is additive, so if a connect check is used, checkcount is 3 and checktimeout is 2 seconds, then a total of 6 seconds worth of timeout will occur before the check fails. If defined in a virtual server section then the global value is overridden. Default: 1 BI The number of consecutive times a failure will have to be reported by a check before the realserver is considered to have failed. A value of 1 will have the realserver considered failed on the first failure. A successful check will reset the failure counter to 0. If defined in a virtual server section then the global value is overridden. Default: 1 BB | B Defines if should continuously check the configuration file for modification. If this is set to 'yes' and the configuration file changed on disk and its modification time (mtime) is newer than the previous version, the configuration is automatically reloaded. Default: no BIB<"> If this directive is defined, B automatically calls the executable I after the configuration file has changed on disk. This is useful to update the configuration file through B on the other heartbeated host. The first argument to the callback is the name of the configuration. This directive might also be used to restart B automatically after the configuration file changed on disk. However, if B is set to yes, the configuration is reloaded anyway. BI [B | B | B] the server onto which a webservice is redirected if all real servers are down. Typically this would be 127.0.0.1 with an emergency page. If defined in a virtual server section then the global value is overridden. 
BIB<"> If this directive is defined, the supplied script is executed whenever all real servers for a virtual service are down or when the first real server comes up again. In the first case, it is called with "start" as its first argument, in the latter with "stop". Additional parameters are vserver with vport (vserver:vport) as second param and protocol (tcp/udp) as third param to identify the virtual service within the fallback script. If defined in a virtual server section then the global value is overridden. BIB<">|syslog_facility An alternative logfile might be specified with this directive. If the logfile does not have a leading '/', it is assumed to be a syslog(3) facility name. Default: log directly to the file I. BI[, I]...B<"> A valid email address for sending alerts about the changed connection status to any real server defined in the virtual service. This option requires perl module MailTools to be installed. Automatically tries to send email using any of the built-in methods. See perldoc Mail::Mailer for more info on methods. Multiple addresses may be supplied, comma delimited. If defined in a virtual server section then the global value is overridden. BI A valid email address to use as the from address of the email alerts. You can use a plain email address or any RFC-compliant string for the From header in the body of an email message (such as: "ldirectord Alerts" ) Do not quote this string unless you want the quotes passed in as part of the From header. Default: unset, take system generated default (probably root@hostname) B I Delay in seconds between repeating email alerts while any given real server in the virtual service remains inaccessible. A setting of zero seconds will inhibit the repeating alerts. The email timing accuracy of this setting is dependent on the number of seconds defined in the checkinterval configuration option. If defined in a virtual server section then the global value is overridden. Default: 0 BB | B | B | B | B | B,... 
Comma delimited list of server states in which email alerts should be sent. B is a short-hand for "B,B,B,B". If B is specified, no other option may be specified, otherwise options are ored with each other. If defined in a virtual server section then the global value is overridden. Default: all BIB<"> A valid SMTP server address to use for sending email via SMTP. If defined in a virtual server section then the global value is overridden. BIB<"> Use this directive to start an instance of ldirectord for the named I. BB | B If I, then ldirectord does not go into background mode. All log-messages are redirected to stdout instead of a logfile. This is useful to run B supervised from daemontools. See http://untroubled.org/rpms/daemontools/ or http://cr.yp.to/daemontools.html for details. Default: I BB | B If I, then ldirectord will spawn a child process for every virtual server, and run checks against the real servers from them. This will increase response times to changes in real server status in configurations with many virtual servers. This may also use less memory then running many separate instances of ldirectord. Child processes will be automatically restarted if they die. Default: I BB | B If I, then when real or failback servers are determined to be down, they are not actually removed from the kernel's LVS table. Rather, their weight is set to zero which means that no new connections will be accepted. This has the side effect, that if the real server has persistent connections, new connections from any existing clients will continue to be routed to the real server, until the persistent timeout can expire. See L for more information on persistent connections. This side-effect can be avoided by running the following: echo 1 > /proc/sys/net/ipv4/vs/expire_quiescent_template If the proc file isn't present this probably means that the kernel doesn't have LVS support, LVS support isn't loaded, or the kernel is too old to have the proc file. 
Running ipvsadm as root should load LVS into the kernel if it is possible. If I, then the real or failback servers will be removed from the kernel's LVS table. The default is I. If defined in a virtual server section then the global value is overridden. Default: I BB | B If I, then when real or failback servers are determined to be down, they are readded to the kernel's LVS table with weight 0 if they do not exist in the table. Setting the value to no, allows manually removing the realserver to manually disable all persistent connections. BB | B If I, then when ldirectord exits it will remove all of the virtual server pools that it is managing from the kernel's LVS table. If I, then the virtual server pools it is managing and any real or failback servers listed in them at the time ldirectord exits will be left as-is. If you want to be able to stop ldirectord without having traffic to your realservers interrupted you will want to set this to I. If defined in a virtual server section then the global value is overridden. Default: I BI If this option is set ldirectord will look for a special file in the specified directory and, if found, force the status of the real server identified by the file to down, skipping the normal health check. This would be useful if you wish to force servers down for maintenance without having to modify the actual ldirectord configuration file. For example, given a realserver with IP 172.16.1.2, service on port 4444, and a resolvable reverse DNS entry pointing to "realserver2.example.com" ldirectord will check for the existence of the following files: =over =item 172.16.1.2:4444 =item 172.16.1.2 =item realserver2.example.com:4444 =item realserver2.example.com =item realserver2:4444 =item realserver2 =back If any one of those files is found then ldirectord will immediately force the status of the server to down as if the check had failed. 
Note: Since it checks for the IP/hostname without the port this means you can decide to place an entire realserver into maintenance across a large number of virtual service pools with a single file (if you were going to reboot the server, for instance) or include the port number and put just a particular service into maintenance. This option is not valid in a virtual server section. Default: disabled =head2 Section virtual The following commands must follow a B entry and must be indented with a minimum of 4 spaces or one tab. B Iip_address|hostname][:portnumber|servicename>] B | B | B [I] [B<">IB<", ">IB<">] Defines a real service by IP-address (or hostname) and port (or servicename). If the port is omitted then a 0 will be used, this is intended primarily for fwmark services where the port for real servers is ignored. Optionally a range of IPv4 addresses (or two hostnames) may be given, in which case each IPv4 address in the range will be treated as a real server using the given port. The second argument defines the forwarding method, must be B, B or B. The third argument is optional and defines the weight for that real server. If omitted then a weight of 1 will be used. The last two arguments are also optional. They define a request-receive pair to be used to check if a server is alive. They override the request-receive pair in the virtual server section. These two strings must be quoted. If the request string starts with I the IP-address and port of the real server is overridden, otherwise the IP-address and port of the real server is used. =head2 For TCP and UDP (non fwmark) virtual services, unless the forwarding method is B and the IP address of a real server is non-local (not present on a interface on the host running ldirectord) then the port of the real server will be set to that of its virtual service. That is, port-mapping is only available to if the real server is another machine and the forwarding method is B. 
This is due to the way that the underlying LVS code in the kernel functions. =head2 More than one of these entries may be inside a virtual section. The checktimeout, negotiatetimeout, checkcount, fallback, emailalert, emailalertfreq and quiescent options listed above may also appear inside a virtual section, in which case the global setting is overridden. BB | B | B | B | B | B | B | BI Type of check to perform. Negotiate sends a request and matches a receive string. Connect only attempts to make a TCP/IP connection, thus the request and receive strings may be omitted. If checktype is a number then negotiate and connect is combined so that after each N connect attempts one negotiate attempt is performed. This is useful to check often if a service answers and in much longer intervals a negotiating check is done. Ping means that ICMP ping will be used to test the availability of real servers. Ping is also used as the connect check for UDP services. Off means no checking will take place and no real or fallback servers will be activated. On means no checking will take place and real servers will always be activated. Default is I. BB | B | B | B | B | B | B | B | B | B | B | B | B | B | B | B | B | B | B | B The type of service to monitor when using checktype=negotiate. None denotes a service that will not be monitored. simpletcp sends the B string to the server and tests it against the B regexp. The other types of checks connect to the server using the specified protocol. Please see the B and B sections for protocol specific information. 
Default: =over 4 =item * Virtual server port is 21: ftp =item * Virtual server port is 25: smtp =item * Virtual server port is 53: dns =item * Virtual server port is 80: http =item * Virtual server port is 110: pop =item * Virtual server port is 119: nntp =item * Virtual server port is 143: imap =item * Virtual server port is 389: ldap =item * Virtual server port is 443: https =item * Virtual server port is 587: submission =item * Virtual server port is 993: imaps =item * Virtual server port is 995: pops =item * Virtual server port is 1521: oracle =item * Virtual server port is 1812: radius =item * Virtual server port is 3128: http_proxy =item * Virtual server port is 3306: mysql =item * Virtual server port is 5432: pgsql =item * Virtual server port is 5060: sip =item * Otherwise: none =back BIB<"> This setting is used if checktype is external or external-perl and is the command to be run to check the status of a real server. It should exit with status 0 if everything is ok, or non-zero otherwise. Four parameters are passed to the script: =over 4 =item * virtual server ip/firewall mark =item * virtual server port =item * real server ip =item * real server port =back If the checktype is external-perl then the command is assumed to be a Perl script and it is evaluated into an anonymous subroutine which is called at check time, avoiding a fork-exec. The argument signature and exit code conventions are identical to checktype external. That is, an external-perl checktype should also work as an external checktype. Default: /bin/true BI Number of port to monitor. Sometimes check port differs from service port. Default: port specified for each real server BIB<"> This object will be requested each checkinterval seconds on each real server. The string must be inside quotes. Note that this string may be overridden by an optional per real-server based request-string. 
For an HTTP/HTTPS check, this should be a relative URI, while it has to be absolute for the 'http_proxy' check type. In the latter case, this URI will be requested through the proxy backend that is being checked. For a DNS check this should be the name of an A record, or the address of a PTR record to look up. For a MySQL, Oracle or PostgreSQL check, this should be an SQL SELECT query. The data returned is not checked, only that the answer is one or more rows. This is a required setting. For a simpletcp check, this string is sent verbatim except any occurrences of \n are replaced with a new line character. BIB<"> If the requested result contains this I, the real server is declared alive. The regexp must be inside quotes. Keep in mind that regexps are not plain strings and that you need to escape the special characters if they should be treated as literals. Note that this regexp may be overridden by an optional per real-server based receive regexp. For a DNS check this should be any one of the A record's addresses or any one of the PTR record's names. In case of dynamic DNS answers (different answers to the same question) a regex to match multiple addresses or PTR record names could also be defined. For a MySQL check, the receive setting is not used. B | B Sets the HTTP method which should be used to fetch the URI specified in the request-string. GET is the method used by default if the parameter is not set. If HEAD is used, the receive-string should be unset. Default: GET BIB<"> Used when using a negotiate check with HTTP or HTTPS. Sets the host header used in the HTTP request. In the case of HTTPS this generally needs to match the common name of the SSL certificate. If not set then the host header will be derived from the request url for the real server if present. As a last resort the IP address of the real server will be used. BIB<"> For FTP, IMAP, LDAP, MySQL, Oracle, POP and PostgreSQL, the username used to log in. -For Radius the username is used for the attribute User-Name.
+For RADIUS the username is used for the attribute User-Name. For SIP, the username is used as both the to and from address for an OPTIONS query. Default: =over 4 =item * FTP: Anonymous =item * MySQL, Oracle, and PostgreSQL: Must be specified in the configuration =item * SIP: ldirectord\@, hostname is derived as per the passwd option below. =item * Otherwise: empty string, in which case authentication will not be attempted. =back BIB<"> Password to use to log in to FTP, IMAP, LDAP, MySQL, Oracle, POP, PostgreSQL and SIP servers. -For Radius the passwd is used for the attribute User-Password. +For RADIUS the passwd is used for the attribute User-Password. Default: =over 4 =item * FTP: ldirectord\@, where hostname is the environment variable HOSTNAME evaluated at run time, or sourced from uname if unset. =item * Otherwise: empty string. In the case of LDAP, MySQL, Oracle, and PostgreSQL this means that authentication will not be performed. =back BIB<"> Database to use for MySQL, Oracle and PostgreSQL servers, this is the database that the query (set by B above) will be performed against. This is a required setting. BIB<"> -Secret to use for Radius servers, this is the secret used to perform an +Secret to use for RADIUS servers, this is the secret used to perform an Access-Request with the username (set by B above) and passwd (set by B above). Default: empty string B I Scheduler to be used by LVS for loadbalancing. For information on the available schedulers please see the ipvsadm(8) man page. Default: "wrr" B I Number of seconds for persistent client connections. B I | I Netmask to be used for granularity of persistent client connections. IPv4 netmask should be specified in dotted quad notation. IPv6 netmask should be specified as a prefix length between 1 and 128. B | B | B Protocol to be used. If the virtual is specified as an IP address and port then it must be one of tcp or udp. If it is a firewall mark then the protocol must be fwm.
Default: =over 4 =item * Virtual is an IP address and port, and the port is not 53: tcp =item * Virtual is an IP address and port, and the port is 53: udp =item * Virtual is a firewall mark: fwm =back BIB<"> File to continuously log the real service checks to for this virtual service. This is useful for monitoring when and why real services were down or for statistics. The log format is: [timestamp|pid|real_service_id|status|message] Default: no separate logging of service checks. BB | B Specify that a virtual service uses one-packet scheduling. This option can be used only for UDP services. If this option is specified, all connections are created only to schedule one packet. This option is useful to schedule UDP packets from the same client port to different real servers. +BI + +A name for this service. This is for the sole purpose of making it easier +to know which service is affected when e-mail notifications are sent out. +It will be included in the e-mail subject and body. + +BI + +Notes about this service to be included in e-mail notifications (for example, +purpose of the service or relevant administrator to contact). + =head1 IPv6 Directives for IPv6 are virtual6, real6, fallback6. IPv6 addresses specified for virtual6, real6, fallback6 and a file of maintenance directory should be enclosed by brackets ([2001:db8::abcd]:80). The following checktype and service values are supported. BB | B | B | B | B | B | BI BB | B | B | B | B | B | B Note: When using a service type with http or https, you need to install the Perl module perl-Net-INET6Glue.
=head1 FILES B<@sysconfdir@/ha.d/ldirectord.cf> B BIB<.pid> B =head1 SEE ALSO L, L Ldirectord Web Page: http://www.vergenet.net/linux/ldirectord/ =head1 AUTHORS Horms Jacob Rief =cut use strict; # Set defaults for configuration variables in the "set_defaults" function use vars qw( $VERSION_STR $AUTOCHECK $CHECKINTERVAL $LDIRECTORD $LDIRLOG $NEGOTIATETIMEOUT $DEFAULT_NEGOTIATETIMEOUT $RUNPID $CHECKTIMEOUT $DEFAULT_CHECKTIMEOUT $CHECKCOUNT $FAILURECOUNT $QUIESCENT $READDQUIESCENT $FORKING $EMAILALERT $EMAILALERTFREQ $EMAILALERTSTATUS $EMAILALERTFROM $SMTP $CLEANSTOP $MAINTDIR $CALLBACK $CFGNAME $CMD $CONFIG $DEBUG $FALLBACK $FALLBACK6 $FALLBACKCOMMAND $SUPERVISED $IPVSADM $checksum $DAEMON_STATUS $DAEMON_STATUS_STARTING $DAEMON_STATUS_RUNNING $DAEMON_STATUS_STOPPING $DAEMON_STATUS_RELOADING $DAEMON_STATUS_ALL $DAEMON_TERM $DAEMON_HUP $DAEMON_CHLD $opt_d $opt_h $stattime %LD_INSTANCE @OLDVIRTUAL @REAL @VIRTUAL $HOSTNAME %EMAILSTATUS %FORK_CHILDREN $SERVICE_UP $SERVICE_DOWN %check_external_perl__funcs $CRLF ); $VERSION_STR = "Linux Director v1.186-ha"; $DAEMON_STATUS_STARTING = 0x1; $DAEMON_STATUS_RUNNING = 0x2; $DAEMON_STATUS_STOPPING = 0x4; $DAEMON_STATUS_RELOADING = 0x8; $DAEMON_STATUS_ALL = $DAEMON_STATUS_STARTING | $DAEMON_STATUS_RUNNING | $DAEMON_STATUS_STOPPING | $DAEMON_STATUS_RELOADING; $SERVICE_UP = 0; $SERVICE_DOWN =1; # default values $DAEMON_TERM = undef; $DAEMON_HUP = undef; $LDIRECTORD = ld_find_cmd("ldirectord", 1); if (! defined $LDIRECTORD) { $LDIRECTORD = "@sbindir@/ldirectord"; } $RUNPID = "/var/run/ldirectord"; $CRLF = "\x0d\x0a"; # Set global configuration default values: set_defaults(); use Getopt::Long; use Pod::Usage; #use English; #use Time::HiRes qw( gettimeofday tv_interval ); use Socket; use Socket6 qw(NI_NUMERICHOST NI_NUMERICSERV NI_NAMEREQD getaddrinfo getnameinfo inet_pton inet_ntop); # Workaround warnning messages : Three "_in6" symbols redefined. 
eval "use Socket6 qw(pack_sockaddr_in6)" unless defined &pack_sockaddr_in6; eval "use Socket6 qw(sockaddr_in6)" unless defined &sockaddr_in6; eval "use Socket6 qw(unpack_sockaddr_in6)" unless defined &unpack_sockaddr_in6; use Sys::Hostname; use POSIX qw(setsid :sys_wait_h); use Sys::Syslog qw(:DEFAULT setlogsock); BEGIN { # wrap exit() to preserve replacability *CORE::GLOBAL::exit = sub { CORE::exit(@_ ? shift : 0); }; } # command line options my @OLD_ARGV = @ARGV; my $opt_d = ''; my $opt_h = ''; my $opt_v = ''; Getopt::Long::Configure ("bundling", "no_auto_abbrev", "require_order"); GetOptions("debug|d" => \$opt_d, "help|h|?" => \$opt_h, "version|v" => \$opt_v) or usage(); # main code $DEBUG = $opt_d ? 3 : 0; if ($opt_h) { exec_wrapper("/usr/bin/perldoc -U $LDIRECTORD"); &ld_exit(127, "Exec failed"); } if ($opt_v) { print("$VERSION_STR\n" . "1999-2006 Jacob Rief, Horms and others\n" . "\n". "\n" . "ldirectord comes with ABSOLUTELY NO WARRANTY.\n" . "This is free software, and you are welcome to redistribute it\n". "under certain conditions. " . "See the GNU General Public Licence for details.\n"); &ld_exit(0, ""); } if ($DEBUG>0 and -f "./ipvsadm") { $IPVSADM="./ipvsadm"; } else { if (-x "/sbin/ipvsadm") { $IPVSADM="/sbin/ipvsadm"; } elsif (-x "/usr/sbin/ipvsadm") { $IPVSADM="/usr/sbin/ipvsadm"; } else { die "Can not find ipvsadm"; } } # There is a memory leak in perl's socket code when # the default IO layer is used. So use "perlio" unless # something else has been explicitly set. 
# http://archive.develooper.com/perl5-porters@perl.org/msg85468.html unless(defined($ENV{'PERLIO'})) { $ENV{'PERLIO'} = "perlio"; exec_wrapper($0, @OLD_ARGV); } $DAEMON_STATUS = $DAEMON_STATUS_STARTING; ld_init(); ld_setup(); ld_start(); ld_cmd_children("start", %LD_INSTANCE); $DAEMON_STATUS = $DAEMON_STATUS_RUNNING; ld_main(); &ld_rm_file("$RUNPID.$CFGNAME.pid"); &ld_exit(0, "Reached end of \"main\""); # functions sub ld_init { # install signal handlers (this covers TERM) #require Net::LDAP; $SIG{'INT'} = \&ld_handler_term; $SIG{'QUIT'} = \&ld_handler_term; $SIG{'ILL'} = \&ld_handler_term; $SIG{'ABRT'} = \&ld_handler_term; $SIG{'FPE'} = \&ld_handler_term; $SIG{'SEGV'} = \&ld_handler_term; $SIG{'TERM'} = \&ld_handler_term; $SIG{'BUS'} = \&ld_handler_term; $SIG{'SYS'} = \&ld_handler_term; $SIG{'XCPU'} = \&ld_handler_term; $SIG{'XFSZ'} = \&ld_handler_term; $SIG{'IOT'} = \&ld_handler_term; # This used to call a signal handler, that logged a message # However, this typically goes to syslog and if syslog # is playing up a loop will occur. $SIG{'PIPE'} = "IGNORE"; # HUP is actually used $SIG{'HUP'} = \&ld_handler_hup; # Reap Children $SIG{'CHLD'} = \&ld_handler_chld; if (defined $ENV{HOSTNAME}) { $HOSTNAME = "$ENV{HOSTNAME}"; } else { use POSIX "uname"; my ($s, $n, $r, $v, $m) = uname; $HOSTNAME = $n; } # search for the correct configuration file if ( !defined $ARGV[0] ) { usage(); } if ( defined $ARGV[0] && defined $ARGV[1] ) { $CONFIG = $ARGV[0]; if ($CONFIG =~ /([^\/]+)$/) { $CFGNAME = $1; } $CMD = $ARGV[1]; } elsif ( defined $ARGV[0] ) { $CONFIG = "ldirectord.cf"; $CFGNAME = "ldirectord"; $CMD = $ARGV[0]; } if ( $CMD ne "start" and $CMD ne "stop" and $CMD ne "status" and $CMD ne "restart" and $CMD ne "try-restart" and $CMD ne "reload" and $CMD ne "force-reload") { usage(); } if ( -f "@sysconfdir@/ha.d/$CONFIG" ) { $CONFIG = "@sysconfdir@/ha.d/$CONFIG"; } elsif ( -f "@sysconfdir@/ha.d/conf/$CONFIG" ) { $CONFIG = "@sysconfdir@/ha.d/conf/$CONFIG"; } elsif ( ! 
-f "$CONFIG" ) { init_error("Config file $CONFIG not found"); } read_config(); undef @OLDVIRTUAL; { my $log_str = "Invoking ldirectord invoked as: $0 "; for my $i (@ARGV) { $log_str .= $i . " "; } ld_log($log_str); } my $oldpid; my $filepid; if (open(FILE, "<$RUNPID.$CFGNAME.pid")) { $_ = ; chomp; $filepid = $_; close(FILE); # Check to make sure this isn't a stale pid file if (open(FILE, "; if (/ldirectord/) { $oldpid = $filepid; } close(FILE); } } if (defined $oldpid) { if ($CMD eq "start") { ld_exit(0, "Exiting from ldirectord $CMD"); } elsif ($CMD eq "stop") { kill 15, $oldpid; ld_exit(0, "Exiting from ldirectord $CMD"); } elsif ($CMD eq "restart" or $CMD eq "try-restart") { kill 15, $oldpid; while (-f "$RUNPID.$CFGNAME.pid") { # wait until old pid file is removed sleep 1; } # N.B Fall through } elsif ($CMD eq "reload" or $CMD eq "force-reload") { kill 1, $oldpid; ld_exit(0, "Exiting from ldirectord $CMD"); } else { # status print STDERR "ldirectord for $CONFIG is running with pid: $oldpid\n"; ld_cmd_children("status", %LD_INSTANCE); ld_log("ldirectord for $CONFIG is running with pid: $oldpid"); ld_log("Exiting from ldirectord $CMD"); ld_exit(0, "Exiting from ldirectord $CMD"); } } else { if ($CMD eq "start" or $CMD eq "restart") { ; } elsif ($CMD eq "stop" or $CMD eq "try-restart") { ld_exit(0, "Exiting from ldirectord $CMD"); } elsif ($CMD eq "status") { my $status; if (defined $filepid) { print STDERR "ldirectord stale pid file " . "$RUNPID.$CFGNAME.pid for $CONFIG\n"; ld_log("ldirectord stale pid file " . 
"$RUNPID.$CFGNAME.pid for $CONFIG"); $status = 1; } else { $status = 3; } print "ldirectord is stopped for $CONFIG\n"; ld_exit($status, "Exiting from ldirectord $CMD"); } else { ld_log("ldirectord is stopped for $CONFIG"); ld_exit(1, "Exiting from ldirectord $CMD"); } } # Run as daemon if ($SUPERVISED eq "yes" || $opt_d) { &ld_log("Starting $VERSION_STR with pid: $$"); } else { &ld_log("Starting $VERSION_STR as daemon"); open(FILE, ">$RUNPID.$CFGNAME.pid") || init_error("Can not open $RUNPID.$CFGNAME.pid"); &ld_daemon(); print FILE "$$\n"; close(FILE); } } sub usage { pod2usage(-input => $LDIRECTORD, -exitval => -1); } sub init_error { my $msg = shift; chomp($msg); &ld_log("$msg"); unless ($opt_d) { print STDERR "$msg\n"; } ld_exit(1, "Initialisation Error"); } # ld_handler_term # If we get a signal then log it and quit sub ld_handler_term { my ($signal) = (@_); if (defined $DAEMON_TERM) { $SIG{'__DIE__'} = "IGNORE"; $SIG{"$signal"} = "IGNORE"; die("Exit Handler Repeatedly Called\n"); } $DAEMON_TERM = $signal; $DAEMON_STATUS = $DAEMON_STATUS_STOPPING; } sub ld_process_term { $DAEMON_STATUS = $DAEMON_STATUS_STOPPING; ld_cmd_children("stop", %LD_INSTANCE); ld_stop(); &ld_log("Linux Director Daemon terminated on signal: $DAEMON_TERM"); &ld_rm_file("$RUNPID.$CFGNAME.pid"); &ld_exit(0, "Linux Director Daemon terminated on signal: $DAEMON_TERM"); } sub ld_handler_hup { $DAEMON_HUP=1; } sub ld_process_hup { &ld_log("Reloading Linux Director Daemon config on signal"); $DAEMON_HUP = undef; &reread_config(); } sub ld_handler_chld { $DAEMON_CHLD=1; # NOTE: calling waitpid here would mess up $? 
} sub ld_process_chld { my $i = 0; undef $DAEMON_CHLD; while (waitpid(-1, WNOHANG) > 0) { print "child: $i\n"; $i++; } } sub check_signal { if (defined $DAEMON_TERM) { ld_process_term(); } if (defined $DAEMON_HUP) { ld_process_hup(); } if (defined $DAEMON_CHLD) { ld_process_chld(); } } sub reread_config { @OLDVIRTUAL = @VIRTUAL; @VIRTUAL = (); my %OLD_INSTANCE = %LD_INSTANCE; my %RELOAD; my %STOP; my %START; my $child; $DAEMON_STATUS = $DAEMON_STATUS_RELOADING; eval { &read_config(); foreach $child (keys %LD_INSTANCE) { if (defined $OLD_INSTANCE{$child}) { $RELOAD{$child} = 1; } else { $START{$child} = 1; } } foreach $child (keys %OLD_INSTANCE) { if (not defined $LD_INSTANCE{$child}) { $STOP{$child} = 1; } } &ld_cmd_children("stop", %STOP); &ld_cmd_children("reload_or_start", %RELOAD); &ld_cmd_children("start", %START); foreach my $vid (keys %FORK_CHILDREN) { &ld_log("Killing child $vid (PID=$FORK_CHILDREN{$vid})"); kill 15, $FORK_CHILDREN{$vid}; } &ld_setup(); &ld_start(); }; if ($@) { @VIRTUAL = @OLDVIRTUAL; %LD_INSTANCE = %OLD_INSTANCE; } $DAEMON_STATUS = $DAEMON_STATUS_RUNNING; undef @OLDVIRTUAL; } sub parse_emailalertstatus { my ($line, $arg) = (@_); my @s = split/\s*,\s*/, $arg; my $none = 0; my $status = 0; for my $i (@s) { if ($i eq "none") { $none++; } } for my $i (@s) { if ($i eq "none") { next; } elsif ($i eq "all") { $status = $DAEMON_STATUS_ALL; } elsif ($i eq "starting") { $status |= $DAEMON_STATUS_STARTING; } elsif ($i eq "stopping") { $status |= $DAEMON_STATUS_STOPPING; } elsif ($i eq "running") { $status |= $DAEMON_STATUS_RUNNING; } elsif ($i eq "reloading") { $status |= $DAEMON_STATUS_RELOADING; } else { &config_error($line, "invalid email alert status at: \"$i\"") } if ($none > 0) { &config_error($line, "invalid email alert status: " . 
"\"$i\" specified with \"none\""); } } return $status; } sub set_defaults { $AUTOCHECK = "no"; $CALLBACK = undef; $CHECKCOUNT = 1; $CHECKINTERVAL = 10; $CHECKTIMEOUT = -1; $CLEANSTOP = "yes"; $DEFAULT_CHECKTIMEOUT = 5; $DEFAULT_NEGOTIATETIMEOUT = 30; $EMAILALERT = ""; $EMAILALERTFREQ = 0; $EMAILALERTFROM = undef; $EMAILALERTSTATUS = $DAEMON_STATUS_ALL; $FAILURECOUNT = 1; $FALLBACK = undef; $FALLBACK6 = undef; $FALLBACKCOMMAND = undef; $FORKING = "no"; $LDIRLOG = "/var/log/ldirectord.log"; $MAINTDIR = undef; $NEGOTIATETIMEOUT = -1; $QUIESCENT = "no"; $READDQUIESCENT = "no"; $SUPERVISED = "no"; $SMTP = undef; } sub read_emailalert { my ($line, $addr) = (@_); # Strip of enclosing quotes $addr =~ s/^\"([^"]*)\"$/$1/; $addr =~ /(.+)/ or &config_error($line, "no email address specified"); return $addr; } sub read_config { undef @VIRTUAL; undef @REAL; undef $CALLBACK; undef %LD_INSTANCE; undef $checksum; # Reset/set global config variables to defaults before parsing the config file. set_defaults(); $stattime = 0; my %virtual_seen; open(CFGFILE, "<$CONFIG") or &config_error(0, "can not open file $CONFIG"); my $line = 0; my $linedata; while() { $line++; $linedata = $_; outer_loop: if ($linedata =~ /^virtual(6)?\s*=\s*(.*)/) { my $af = defined($1) ? AF_INET6 : AF_INET; my $vattr = $2; my $ip_port = undef; my $fwm = undef; my $virtual_id; my $virtual_line = $line; my $virtual_port; my $fallback_line; my @rsrv_todo; if ($vattr =~ /^(\d+\.\d+\.\d+\.\d+):([0-9A-Za-z-_]+)/ && $af == AF_INET) { $ip_port = "$1:$2"; $virtual_port = $2; } elsif ($vattr =~ /^([0-9A-Za-z._+-]+):([0-9A-Za-z-_]+)/) { $ip_port = "$1:$2"; $virtual_port = $2; } elsif ($vattr =~ /^(\d+)/){ $fwm = $1; } elsif ($vattr =~ /^\[([0-9A-Fa-f:]+)\]:([0-9A-Za-z-_]+)/ && $af == AF_INET) { &config_error($line, "cannot specify an IPv6 address here. 
please use \"virtual6\" instead."); } elsif ($vattr =~ /^\[([0-9A-Fa-f:]+)\]:([0-9A-Za-z-_]+)/ && $af == AF_INET6) { my $v6addr = $1; my $v6port = $2; if (!inet_pton(AF_INET6,$v6addr)) { &config_error($line,"invalid ipv6 address for virtual server"); } $ip_port = "[$v6addr]:$v6port"; $virtual_port = $v6port; } else { &config_error($line, "invalid address for virtual server"); } my (%vsrv, @rsrv); if ($ip_port) { $vsrv{checktype} = "negotiate"; $vsrv{protocol} = "tcp"; if ($ip_port =~ /:(53|domain)$/) { $vsrv{protocol} = "udp"; } $vsrv{port} = $virtual_port; } else { $vsrv{fwm} = $fwm; $vsrv{checktype} = "negotiate"; $vsrv{protocol} = "fwm"; $vsrv{service} = "none"; $vsrv{port} = "0"; } $vsrv{addressfamily} = $af; $vsrv{real} = \@rsrv; $vsrv{scheduler} = "wrr"; $vsrv{checkcommand} = "/bin/true"; $vsrv{request} = "/"; $vsrv{receive} = ""; $vsrv{login} = ""; $vsrv{passwd} = ""; $vsrv{database} = ""; $vsrv{checktimeout} = -1; $vsrv{checkcount} = -1; $vsrv{negotiatetimeout} = -1; $vsrv{failurecount} = -1; $vsrv{num_connects} = 0; $vsrv{httpmethod} = "GET"; $vsrv{secret} = ""; $vsrv{ops} = "no"; push(@VIRTUAL, \%vsrv); while() { $line++; $linedata=$_; if(m/^\s*#/) { next; } s/#.*//; s/\t/ /g; unless (/^ {4,}(.+)/) { last; } my $rcmd = $1; if ($rcmd =~ /^(real(6)?)\s*=\s*(.*)/) { if ($af == AF_INET && defined($2) || $af == AF_INET6 && ! defined($2)) { &config_error($line, join("", ("cannot specify \"$1\" here. please use \"real", ($af == AF_INET) ? "" : "6", "\" instead"))); } push @rsrv_todo, [$3, $line]; } elsif ($rcmd =~ /^request\s*=\s*\"(.*)\"/) { $1 =~ /(.+)/ or &config_error($line, "no request string specified"); $vsrv{request} = $1; unless($vsrv{request}=~/^\//){ $vsrv{request} = "/" . 
$vsrv{request}; } } elsif ($rcmd =~ /^receive\s*=\s*\"(.*)\"/) { $1 =~ /(.+)/ or &config_error($line, "invalid receive string"); $vsrv{receive} = $1; } elsif ($rcmd =~ /^checktype\s*=\s*(.*)/){ if ($1 =~ /(\d+)/ && $1>=0) { $vsrv{num_connects} = $1; $vsrv{checktype} = "combined"; } elsif ( $1 =~ /([\w-]+)/ && ($1 eq "connect" || $1 eq "negotiate" || $1 eq "ping" || $1 eq "off" || $1 eq "on" || $1 eq "external" || $1 eq "external-perl") ) { $vsrv{checktype} = $1; } else { &config_error($line, "checktype must be \"connect\", \"negotiate\", \"on\", \"off\", \"ping\", \"external\", \"external-perl\" or a positive number"); } } elsif ($rcmd =~ /^checkcommand\s*=\s*\"(.*)\"/ or $rcmd =~ /^checkcommand\s*=\s*(.*)/){ $1 =~ /(.+)/ or &config_error($line, "invalid check command"); $vsrv{checkcommand} = $1; } elsif ($rcmd =~ /^checktimeout\s*=\s*(.*)/){ $1 =~ /(\d+)/ && $1 or &config_error($line, "invalid check timeout"); $vsrv{checktimeout} = $1; } elsif ($rcmd =~ /^connecttimeout\s*=\s*(.*)/){ &config_error($line, "connecttimeout directive " . "deprecated in favour of " . "negotiatetimeout"); } elsif ($rcmd =~ /^negotiatetimeout\s*=\s*(.*)/){ $1 =~ /(\d+)/ && $1 or &config_error($line, "invalid negotiate timeout"); $vsrv{negotiatetimeout} = $1; } elsif ($rcmd =~ /^checkcount\s*=\s*(.*)/){ $1 =~ /(\d+)/ && $1 or &config_error($line, "invalid check count"); $vsrv{checkcount} = $1; &config_warn($line, "checkcount option is deprecated and slated for removal. 
please see 'failurecount'"); } elsif ($rcmd =~ /^failurecount\s*=\s*(.*)/){ $1 =~ /(\d+)/ && $1 or &config_error($line, "invalid failure count"); $vsrv{failurecount} = $1; } elsif ($rcmd =~ /^checkinterval\s*=\s*(.*)/){ $1 =~ /(\d+)/ && $1 or &config_error($line, "invalid checkinterval"); $vsrv{checkinterval} = $1 } elsif ($rcmd =~ /^checkport\s*=\s*(.*)/){ $1 =~ /(\d+)/ or &config_error($line, "invalid port"); ( $1 > 0 && $1 < 65536 ) or &config_error($line, "checkport must be in range 1..65536"); $vsrv{checkport} = $1; } elsif ($rcmd =~ /^login\s*=\s*\"(.*)\"/) { $1 =~ /(.+)/ or &config_error($line, "invalid login string"); $vsrv{login} = $1; } elsif ($rcmd =~ /^passwd\s*=\s*\"(.*)\"/) { $1 =~ /(.+)/ or &config_error($line, "invalid password"); $vsrv{passwd} = $1; } elsif ($rcmd =~ /^database\s*=\s*\"(.*)\"/) { $1 =~ /(.+)/ or &config_error($line, "invalid database"); $vsrv{database} = $1; } elsif ($rcmd =~ /^secret\s*=\s*\"(.*)\"/) { $1 =~ /(.+)/ or &config_error($line, "invalid secret"); $vsrv{secret} = $1; } elsif ($rcmd =~ /^load\s*=\s*\"(.*)\"/) { $1 =~ /(\w+)/ or &config_error($line, "invalid string for load testing"); $vsrv{load} = $1; } elsif ($rcmd =~ /^scheduler\s*=\s*(.*)/) { # Intentionally ommit checking the # scheduler against a list of know # schedulers. This is because from # time to time new schedulers are # added. But ldirectord is # maintained distributed # independently of this. Thus # ldirectord needs to be manually # updated/upgraded. So just accept # any scheduler that matches # [a-z]+. I.e. is syntactically # correct (all schedulers so far # match that pattern). Ipvsadm will # report an error is a scheduler # isn't available / doesn't exist. 
$1 =~ /([a-z]+)/ or &config_error($line, "invalid scheduler, should be only lowercase letters (a-z)"); $vsrv{scheduler} = $1; } elsif ($rcmd =~ /^persistent\s*=\s*(.*)/) { $1 =~ /(\d+)/ or &config_error($line, "invalid persistent timeout"); $vsrv{persistent} = $1; } elsif ($rcmd =~ /^netmask\s*=\s*(.*)/) { my $val = $1; if ($vsrv{addressfamily} == AF_INET6) { if ($val !~ /^\d+$/ or ($val < 1 || $val > 128)) { &config_error($line, "invalid netmask: a prefix length between 1 and 128 is required"); } } else { if ($val !~ /^\d+\.\d+\.\d+\.\d+$/) { &config_error($line, "invalid netmask: dotted quad notation is required"); } } $vsrv{netmask} = $val; } elsif ($rcmd =~ /^protocol\s*=\s*(.*)/) { if ( $1 =~ /(\w+)/ ) { if ( $vsrv{protocol} eq "fwm" ) { if ($1 eq "fwm" ) { ; #Do nothing, it is already set } else { &config_error($line, "protocol must be fwm if the virtual service is a fwmark (a number)"); } } else { # tcp or udp if ($1 eq "tcp" || $1 eq "udp") { $vsrv{protocol} = $1; } else { &config_error($line, "protocol must be tcp or udp if the virtual service is an address and port"); } } } else { &config_error($line, "invalid protocol"); } } elsif ($rcmd =~ /^ops\s*=\s*(.*)/) { if ($1 eq "yes" || $1 eq "no") { $vsrv{ops} = $1; } else { &config_error($line, "ops must be 'yes' or 'no'"); } } elsif ($rcmd =~ /^service\s*=\s*(.*)/) { $1 =~ /(\w+)/ && ($1 eq "dns" || $1 eq "ftp" || $1 eq "http" || $1 eq "https" || $1 eq "http_proxy" || $1 eq "imap" || $1 eq "imaps" || $1 eq "ldap" || $1 eq "nntp" || $1 eq "mysql" || $1 eq "none" || $1 eq "oracle"|| $1 eq "pop" || $1 eq "pops" || $1 eq "radius"|| $1 eq "pgsql" || $1 eq "sip" || $1 eq "smtp" || $1 eq "submission" || $1 eq "simpletcp") or &config_error($line, "service must " . "be dns, ftp, " . "http, https, " . "http_proxy, " . "imap, imaps, " . "ldap, nntp, " . "mysql, none, " . "oracle, " . "pop, pops, " . "radius, " . "pgsql, " . "simpletcp, " . "sip, smtp " . 
"or submission"); $vsrv{service} = $1; if($vsrv{service} eq "ftp" and $vsrv{login} eq "") { $vsrv{login} = "anonymous"; } elsif($vsrv{service} eq "sip" and $vsrv{login} eq "") { $vsrv{login} = "ldirectord\@$HOSTNAME"; } if($vsrv{service} eq "ftp" and $vsrv{passwd} eq "") { $vsrv{passwd} = "ldirectord\@$HOSTNAME"; } } elsif ($rcmd =~ /^httpmethod\s*=\s*(.*)/) { $1 =~ /(\w+)/ && (uc($1) eq "GET" || uc($1) eq "HEAD") or &config_error($line, "httpmethod must be GET or HEAD"); $vsrv{httpmethod} = uc($1); } elsif ($rcmd =~ /^virtualhost\s*=\s*(.*)/) { $1 =~ /\"?([^\"]*)\"?/ or &config_error($line, "invalid virtualhost"); $vsrv{virtualhost} = $1; } elsif ($rcmd =~ /^(fallback(6)?)\s*=\s*(.*)/) { # Allow specification of a virtual-specific fallback host if ($af == AF_INET && defined($2) || $af == AF_INET6 && ! defined($2)) { &config_error($line, join("", ("cannot specify \"$1\" here. please use \"fallback", ($af == AF_INET) ? "" : "6", "\" instead"))); } $fallback_line=$line; $vsrv{fallback} = parse_fallback($line, $3, \%vsrv); } elsif ($rcmd =~ /^fallbackcommand\s*=\s*\"(.*)\"/ or $rcmd =~ /^fallbackcommand\s*=\s*(.*)/) { $1 =~ /(.+)/ or &config_error($line, "invalid fallback command"); $vsrv{fallbackcommand} = $1; } elsif ($rcmd =~ /^quiescent\s*=\s*(.*)/) { ($1 eq "yes" || $1 eq "no") or &config_error($line, "quiescent must be 'yes' or 'no'"); $vsrv{quiescent} = $1; } elsif ($rcmd =~ /^emailalert\s*=\s*(.*)/) { $vsrv{emailalert} = read_emailalert($line, $1); } elsif ($rcmd =~ /^emailalertfreq\s*=\s*(\d*)/) { $1 =~ /(\d+)/ or &config_error($line, "invalid email alert frequency"); $vsrv{emailalertfreq} = $1; } elsif ($rcmd =~ /^emailalertstatus\s*=\s*(.*)/) { $vsrv{emailalertstatus} = &parse_emailalertstatus($line, $1); } elsif ($rcmd =~ /^monitorfile\s*=\s*\"(.*)\"/ or $rcmd =~ /^monitorfile\s*=\s*(.*)/) { my $monitorfile = $1; unless (open(MONITORFILE, ">>$monitorfile") and close(MONITORFILE)) { &config_error($line, "unable to open monitorfile $monitorfile: $!"); } 
$vsrv{monitorfile} = $monitorfile; } elsif ($rcmd =~ /^cleanstop\s*=\s*(.*)/) { ($1 eq "yes" || $1 eq "no") or &config_error($line, "cleanstop must be 'yes' or 'no'"); $vsrv{cleanstop} = $1; } elsif ($rcmd =~ /^smtp\s*=\s*(.*)/) { $1 =~ /(^([0-9A-Za-z._+-]+))/ or &config_error($line, "invalid SMTP server address"); $vsrv{smtp} = $1; + } elsif ($rcmd =~ /^servicename\s*=\s*(.*)/) { + $vsrv{servicename} = $1; + } elsif ($rcmd =~ /^comment\s*=\s*(.*)/) { + $vsrv{comment} = $1; } else { &config_error($line, "Unknown command \"$linedata\""); } undef $linedata; } # As the protocol needs to be known to call # getservbyname() all resolution must be # delayed until the protocol is finalised. # That is after the entire configuration # for a virtual service has been parsed. &_ld_read_config_fallback_resolve($fallback_line, $vsrv{protocol}, $vsrv{fallback}, $af); &_ld_read_config_virtual_resolve($virtual_line, \%vsrv, $ip_port, $af); &_ld_read_config_real_resolve(\%vsrv, \@rsrv_todo, $af); # Check for duplicate now we have all the # information to generate the id $virtual_id = get_virtual_id_str(\%vsrv); if (defined $virtual_seen{$virtual_id}) { &config_error($line, "duplicate virtual server"); } $virtual_seen{$virtual_id} = 1; unless(defined($linedata)) { last; } #Arggh a goto :( goto outer_loop; } next if ($linedata =~ /^\s*$/ || $linedata =~ /^\s*#/); if ($linedata =~ /^checktimeout\s*=\s*(.*)/) { ($1 =~ /(\d+)/ && $1 && $1>0) or &config_error($line, "invalid check timeout value"); $CHECKTIMEOUT = $1; } elsif ($linedata =~ /^connecttimeout\s*=\s*(.*)/) { &config_error($line, "connecttimeout directive " . "deprecated in favour of " . 
"negotiatetimeout"); } elsif ($linedata =~ /^negotiatetimeout\s*=\s*(.*)/) { ($1 =~ /(\d+)/ && $1 && $1>0) or &config_error($line, "invalid negotiate timeout value"); $NEGOTIATETIMEOUT = $1; } elsif ($linedata =~ /^checkinterval\s*=\s*(.*)/) { $1 =~ /(\d+)/ && $1 or &config_error($line, "invalid check interval value"); $CHECKINTERVAL = $1; } elsif ($linedata =~ /^checkcount\s*=\s*(.*)/) { $1 =~ /(\d+)/ && $1 or &config_error($line, "invalid check count value"); $CHECKCOUNT = $1; &config_warn($line, "checkcount option is deprecated and slated for removal. please see 'failurecount'"); } elsif ($linedata =~ /^failurecount\s*=\s*(.*)/) { $1 =~ /(\d+)/ && $1 or &config_error($line, "invalid failure count value"); $FAILURECOUNT = $1; } elsif ($linedata =~ /^fallback(6)?\s*=\s*(.*)/) { my $af = defined($1) ? AF_INET6 : AF_INET; my $tcp = parse_fallback($line, $2, undef); my $udp = parse_fallback($line, $2, undef); &_ld_read_config_fallback_resolve($line, "tcp", $tcp, $af); &_ld_read_config_fallback_resolve($line, "udp", $udp, $af); if ($af == AF_INET) { $FALLBACK = { "tcp" => $tcp, "udp" => $udp }; } else { $FALLBACK6 = { "tcp" => $tcp, "udp" => $udp }; } } elsif ($linedata =~ /^fallbackcommand\s*=\s*(.*)/) { $1 =~ /(.+)/ or &config_error($line, "invalid fallback command"); $FALLBACKCOMMAND = $1; } elsif ($linedata =~ /^autoreload\s*=\s*(.*)/) { ($1 eq "yes" || $1 eq "no") or &config_error($line, "autoreload must be 'yes' or 'no'"); $AUTOCHECK = $1; } elsif ($linedata =~ /^callback\s*=\s*\"(.*)\"/) { $CALLBACK = $1; } elsif ($linedata =~ /^logfile\s*=\s*\"(.*)\"/) { my $tmpLDIRLOG = $LDIRLOG; $LDIRLOG = $1; if (&ld_openlog()) { $LDIRLOG = $tmpLDIRLOG; &config_error($line, "unable to open logfile: $1"); } } elsif ($linedata =~ /^execute\s*=\s*(.*)/) { $LD_INSTANCE{$1} = 1; } elsif ($linedata =~ /^fork\s*=\s*(.*)/) { ($1 eq "yes" || $1 eq "no") or &config_error($line, "fork must be 'yes' or 'no'"); $FORKING = $1; } elsif ($linedata =~ /^supervised/) { if (($linedata =~ 
/^supervised\s*=\s*(.*)/) and ($1 eq "yes" || $1 eq "no")) { $SUPERVISED = $1; } elsif ($linedata =~ /^supervised\s*$/) { $SUPERVISED = "yes"; &config_warn($line, "please update your config not to " . "use a bare supervised directive"); } else { &config_error($line, "supervised must be 'yes' or 'no'"); } } elsif ($linedata =~ /^quiescent\s*=\s*(.*)/) { ($1 eq "yes" || $1 eq "no") or &config_error($line, "quiescent must be 'yes' or 'no'"); $QUIESCENT = $1; } elsif ($linedata =~ /^readdquiescent\s*=\s*(.*)/) { ($1 eq "yes" || $1 eq "no") or &config_error($line, "readdquiescent must be 'yes' or 'no'"); $READDQUIESCENT = $1; } elsif ($linedata =~ /^emailalert\s*=\s*(.*)/) { $EMAILALERT = read_emailalert($line, $1); } elsif ($linedata =~ /^emailalertfreq\s*=\s*(\d*)/) { $1 =~ /(\d+)/ or &config_error($line, "invalid email alert frequency"); $EMAILALERTFREQ = $1; } elsif ($linedata =~ /^emailalertstatus\s*=\s*(.*)/) { $EMAILALERTSTATUS = &parse_emailalertstatus($line, $1); } elsif ($linedata =~ /^emailalertfrom\s*=\s*(.*)/) { $1 =~ /(.+)/ or &config_error($line, "no email from address specified"); $EMAILALERTFROM = $1; } elsif ($linedata =~ /^cleanstop\s*=\s*(.*)/) { ($1 eq "yes" || $1 eq "no") or &config_error($line, "cleanstop must be 'yes' or 'no'"); $CLEANSTOP = $1; } elsif ($linedata =~ /^smtp\s*=\s*(.*)/) { $1 =~ /(^([0-9A-Za-z._+-]+))/ or &config_error($line, "invalid SMTP server address"); $SMTP = $1; } elsif ($linedata =~ /^maintenancedir\s*=\s*(.*)/) { $1 =~ /(.+)/ or &config_error($line, "maintenance directory not specified"); $MAINTDIR = $1; -d $MAINTDIR or &config_warn($line, "maintenance directory does not exist"); } else { if ($linedata =~ /^timeout\s*=\s*(.*)/) { &config_error($line, "timeout directive " . "deprecated in favour of " . "checktimeout and " . 
"negotiatetimeout"); } &config_error($line, "Unknown command $linedata "); } } close(CFGFILE); # Check for sensible use of checkinterval, warn if it is used in a virtual # service when fork=no if ($FORKING eq 'no') { foreach my $v (@VIRTUAL) { if (defined($$v{checkinterval})) { config_warn(-1, "checkinterval in virtual service ". get_virtual_id_str($v)." ignored when fork=no"); } } } return(0); } # _ld_read_config_virtual_resolve # Note: Should not need to be called directly, but won't do any damage if # you do. # Resolve the server (ip address) and port for a virtual service # pre: line: Line of configuration file fallback server was read from # Used for debugging messages # vsrv: Virtual Service to resolve server and port of # ip_port: server and port in the form # ip_address|hostname:port|service # af: Address family: AF_INET or AF_INET6 # post: Take ip_port, resolve it as per ld_gethostservbyname # and set $vsrv->{server} and $vsrv->{port} accordingly. # If $vsrv->{service} is not set, then set according to the value of # $vsrv->{port} # return: none # Debugging message will be reported and programme will exit # on error. 
sub _ld_read_config_virtual_resolve
{
	my($line, $vsrv, $ip_port, $af)=(@_);

	# Nothing to resolve if no address was given
	if($ip_port){
		$ip_port=&ld_gethostservbyname($ip_port, $vsrv->{protocol},
			$af);
		if ($ip_port =~ /(\[[0-9A-Fa-f:]+\]):(\d+)/) {
			# Bracketed IPv6 address: keep the brackets on the
			# server part so the colons inside are not split
			$vsrv->{server} = $1;
			$vsrv->{port} = $2;
		} elsif($ip_port){
			($vsrv->{server}, $vsrv->{port}) = split /:/, $ip_port;
		} else {
			&config_error($line,
				"invalid address for virtual service");
		}

		# Derive the service from the port unless it was set
		# explicitly in the configuration
		if(!defined($vsrv->{service})){
			$vsrv->{service} = ld_port_to_service($vsrv->{port});
		}
	}
}

# ld_port_to_service
# Resolve an ldirectord service name from its port number
# pre: port: port number of the service
# return: service name
#         "none" if the port is not one of the well-known ports below
sub ld_port_to_service
{
	my ($port) = (@_);

	# Hash keys are compared as strings, which matches the string
	# equality ("eq") test this table replaces.
	my %service_of = (
		21   => "ftp",
		25   => "smtp",
		53   => "dns",
		80   => "http",
		110  => "pop",
		119  => "nntp",
		143  => "imap",
		389  => "ldap",
		443  => "https",
		587  => "submission",
		993  => "imaps",
		995  => "pops",
		1521 => "oracle",
		1812 => "radius",
		3128 => "http_proxy",
		3306 => "mysql",
		5060 => "sip",
		5432 => "pgsql",
	);

	if (exists $service_of{$port}) {
		return $service_of{$port};
	}
	return "none";
}

# ld_service_to_port
# Resolve the port number from an ldirectord service name
# pre: service: name of the service
# return: port number
#         undef if the service is unknown
sub ld_service_to_port
{
	my ($service) = (@_);

	my %port_of = (
		"ftp"        => 21,
		"smtp"       => 25,
		"dns"        => 53,
		"http"       => 80,
		"pop"        => 110,
		"nntp"       => 119,
		"imap"       => 143,
		"ldap"       => 389,
		"https"      => 443,
		"submission" => 587,
		"imaps"      => 993,
		"pops"       => 995,
		"oracle"     => 1521,
		"radius"     => 1812,
		"http_proxy" => 3128,
		"mysql"      => 3306,
		"sip"        => 5060,
		"pgsql"      => 5432,
	);

	if (exists $port_of{$service}) {
		return $port_of{$service};
	}
	return undef;
}

# ld_checkport
# Resolve the port to connect to for service checks
# Note: Should only be used inside service checks,
#       as its not the same as the port of the real server
# pre: v: virtual service
#      r: real server
# return: port number, chosen in this order:
#         the virtual service's checkport, if defined
#         the real server's port, if greater than zero
#         the well-known port of the virtual service's service
#         undef if the service is unknown
sub ld_checkport
{
	my ($v, $r) = (@_);

	if (defined $v->{checkport}) {
		return $v->{checkport};
	}
	if ($r->{port} > 0) {
		return $r->{port};
	}
	return ld_service_to_port($v->{service});
}

# _ld_read_config_fallback_resolve
# Note: Should not need to be called directly, but won't do any damage if
#       you do.
# Resolve the address and port of a fallback server
# pre: line: Line of configuration file fallback server was read from
#            Used for debugging messages
#      protocol: Protocol passed to ld_getservbyname when resolving
#                the port
#      fallback: Reference to the fallback hash (as returned by
#                parse_fallback). May be undefined, in which case
#                nothing is done.
#      af: Address family: AF_INET or AF_INET6
# post: $fallback->{server} is replaced by the result of ld_gethostbyname.
#       If $fallback->{port} is set it is replaced by the result of
#       ld_getservbyname.
# return: none
#         Debugging message will be reported and programme will exit
#         on error.
sub _ld_read_config_fallback_resolve
{
	my($line, $protocol, $fallback, $af)=(@_);

	my ($ipversion, $ipaddress);

	unless($fallback) {
		return;
	}

	# Only used to make the error message below more helpful
	if ($af == AF_INET) {
		$ipversion = "IPv4";
	} elsif ($af == AF_INET6) {
		$ipversion = "IPv6";
	} else {
		$ipversion = "IP??($af)";
	}

	unless ($ipaddress = &ld_gethostbyname($fallback->{server}, $af)) {
		&config_error($line, "invalid $ipversion address or could not resolve for fallback server: " .
			$fallback->{server});
	}
	$fallback->{server} = $ipaddress;

	unless($fallback->{"port"}) {
		return;
	}

	$fallback->{port} = &ld_getservbyname($fallback->{port}, $protocol)
		or &config_error($line, "invalid port for fallback server");
}

# _ld_read_config_real_resolve
# Note: Should not need to be called directly, but won't do any damage if
#       you do.
# Run through the list of real servers read in the configuration file for a
# virtual server and parse these entries
# pre: vsrv: Virtual Service to parse real servers for
#      rsrv_todo: List of real servers read from config but not parsed.
#                 List is a list of list reference. The first element in
#                 each list reference is the line read from the
#                 configuration after "real=". The second element is the
#                 line number, used for error reporting
#      af: Address family: AF_INET or AF_INET6
# post: Run through rsrv_todo and parse real servers
# return: none
#         Debugging message will be reported and programme will exit
#         on error.
sub _ld_read_config_real_resolve
{
	my ($vsrv, $rsrv_todo, $af)=(@_);

	for my $entry (@$rsrv_todo) {
		my ($str, $line) = @$entry;

		# address[->address][:port] flags
		# where address is a dotted quad, a hostname or a bracketed
		# IPv6 address (hex digits and colons only)
		$str =~ /(\d+\.\d+\.\d+\.\d+|[A-Za-z0-9.-]+|\[[0-9A-Fa-f:]+\])(->(\d+\.\d+\.\d+\.\d+|[A-Za-z0-9.-]+|\[[0-9A-Fa-f:]+\]))?(:(\d+|[A-Za-z0-9-_]+))?\s+(.*)/
			or &config_error($line,
				"invalid address for real server" .
				" (wrong format)");

		my $ip1 = $1;
		my $ip2 = $3;
		# Port "0" means that no port was given for the real server
		my $port = defined($5) ? $5 : "0";
		my $flags = $6;

		my $resolved_ip1 = &ld_gethostbyname($ip1, $af);
		unless( defined($resolved_ip1) ) {
			&config_error($line,
				"invalid address ($ip1) for real server" .
				" (could not resolve host)");
		}

		my $resolved_port = &ld_getservbyname($port, "");
		unless( defined($resolved_port) ){
			&config_error($line,
				"invalid port ($port) for real server" .
				" (could not resolve port)");
		}

		if ( defined ($ip2) ) {
			# "first->last" form: add every address in the range
			my $resolved_ip2 = &ld_gethostbyname($ip2, $af);
			unless( defined ($resolved_ip2) ) {
				&config_error($line,
					"invalid address ($ip2) for " .
					"real server" .
					" (could not resolve end host)");
			}
			&add_real_server_range($line, $vsrv, $resolved_ip1,
				$resolved_ip2, $resolved_port, $flags, $af);
		} else {
			&add_real_server($line, $vsrv, $resolved_ip1,
				$resolved_port, $flags);
		}
	}
}

# add_real_server_range
# Add a real server for each IP address in a range
# pre: line: line number real server was read from
#            Used for debugging information
#      vsrv: virtual server to add real server to
#      first: First IP address in range
#      last: Last IP address in range
#      port: Port of real servers
#      flags: Flags for real servers. Should be of the form
#             gate|masq|ipip [<weight>] ["<request>", "<receive>"]
#      af: Address family: AF_INET or AF_INET6
# post: real servers are added to virtual server
# return: none
#         Debugging message will be reported and programme will exit
#         on error.
sub add_real_server_range
{
	my ($line, $vsrv, $first, $last, $port, $flags, $af) = (@_);

	my ($first_i, $last_i, $i);

	if ($af == AF_INET) {
		if ( ($first_i=&ip_to_int($first)) <0 ) {
			&config_error($line, "Invalid IP address: $first");
		}
		if ( ($last_i=&ip_to_int($last)) <0 ) {
			&config_error($line, "Invalid IP address: $last");
		}

		if ($first_i>$last_i) {
			&config_error($line,
				"Invalid Range: $first-$last: First value must be " .
				"less than or equal to the second value");
		}

		# The addresses are integers here, so the comparison must be
		# numeric. A string comparison ("le") ends the loop early
		# whenever the two values differ in their number of decimal
		# digits.
		$i=$first_i;
		while ( $i <= $last_i ) {
			&add_real_server($line, $vsrv, &int_to_ip($i), $port,
				$flags);
			$i++;
		}
	} elsif ($af == AF_INET6) {
		# not supported yet
		&config_error($line,
			"Address range for IPv6 is not supported yet");
	} else {
		die "address family must be AF_INET or AF_INET6\n";
	}
}

# add_real_server
# Add a real server to a virtual
# pre: line: line number real server was read from
#            Used for debugging information
#      vsrv: virtual server to add real server to
#      ip: IP address of real server
#      port: Port of real server
#      flags: Flags for real server. Should be of the form
#             gate|masq|ipip [<weight>] ["<request>", "<receive>"]
# post: real server is added to virtual server
#       the real server is recorded in @REAL against this virtual server
# return: none
#         Debugging message will be reported and programme will exit
#         on error.
sub add_real_server
{
	my ($line, $vsrv, $ip, $port, $flags) = (@_);

	my $realsrv=0;
	my $new_rsrv;
	my $rsrv;

	$new_rsrv = {"server"=>$ip, "port"=>$port, "failcount"=>0};

	# Mandatory forwarding method
	$flags =~ /(\w+)(.*)/ && ($1 eq "gate" || $1 eq "masq" || $1 eq "ipip")
		or &config_error($line,
			"forward method must be gate, masq or ipip");
	$new_rsrv->{"forward"} = $1;
	$flags = $2;

	$rsrv=$vsrv->{"real"};

	# Optional weight, defaults to 1
	if(defined($flags) and $flags =~ /\s+(\d+)(.*)/) {
		$new_rsrv->{"weight"} = $1;
		$flags = $2;
	} else {
		$new_rsrv->{"weight"} = 1;
	}

	# Optional "request", "receive" pair
	if(defined($flags) and $flags =~ /\s+\"(.*)\"[, ]\s*\"(.*)\"(.*)/) {
		# Copy the captures before another match is run so that
		# they cannot be disturbed by it
		my ($request, $receive, $rest) = ($1, $2, $3);

		# The request is always stored with a leading "/"
		unless ($request =~ /^\//) {
			$request = "/" . $request;
		}
		$new_rsrv->{"request"} = $request;
		$new_rsrv->{"receive"} = $receive;
		$flags = $rest;
	}

	# Anything left over is a syntax error
	if (defined($flags) and $flags =~/\S/) {
		&config_error($line, "Invalid real server line, around "
			. "\"$flags\"");
	}

	push(@$rsrv, $new_rsrv);

	# Record which virtual services use this real server
	my $real = get_real_id_str($new_rsrv, $vsrv);
	my $virtual = get_virtual_id_str($vsrv);
	for my $r (@REAL){
		if($r->{"real"} eq $real){
			my $ref=$r->{"virtual"};
			push(@$ref, $virtual);
			$realsrv=1;
			last;
		}
	}
	if($realsrv==0){
		push(@REAL, { "real"=>$real, "virtual"=>[ $virtual ] });
	}
}

# parse_fallback
# Parse a fallback server
# pre: line: line number real server was read from
#      fallback: line read from configuration file
#                Should be of the form
#                ip_address|hostname[:port|:service_name] [gate|masq|ipip]
#      vsrv: virtual service, may be undefined. If defined its port is
#            used when the fallback line does not give one.
# post: fallback is parsed
# return: Reference to hash of the form
#         { server => blah, port => blah, forward => blah, weight => 1 }
#         forward defaults to "gate" if it is omitted
#         Debugging message will be reported and programme will exit
#         on error.
sub parse_fallback
{
	my ($line, $fallback, $vsrv) = (@_);

	my $parse_line;
	my $server;
	my $port;
	my $fwd;

	$parse_line = $fallback;

	if ($parse_line =~ /(\S+)(\s+(\S+))?\s*$/) {
		# get "ip:port" and a forwarding method
		$fwd = $3;
		$parse_line = $1;
	}

	if ($parse_line =~ /(:(\d+|[A-Za-z0-9-_]+))?$/) {
		# get host and port
		$port=$2;
		$parse_line =~ s/(:(\d+|[A-Za-z0-9-_]+))?$//;
		$server = $parse_line;
	}

	unless(defined($server)) {
		&config_error($line, "invalid fallback server: $fallback");
	}

	if (not defined($port) and defined($vsrv)) {
		$port = $vsrv->{"port"};
	}

	if($fwd) {
		($fwd eq "gate" || $fwd eq "masq" || $fwd eq "ipip")
			or &config_error($line,
				"forward method must be gate, masq or ipip");
	} else {
		$fwd="gate"
	}

	return({"server"=>$server, "port"=>$port, "forward"=>$fwd,
		"weight"=>1});
}

# __config_log
# Report a message about the configuration
# pre: line: line number of the configuration file the message relates to.
#            The file name and line number are only included in the
#            message if this is greater than zero.
#      prefix: Text to start the message with, e.g. "Warning" or "Error"
#      msg: The message
# post: The message is printed to STDERR if $opt_d is set or the daemon
#       is starting, otherwise it is passed to ld_log.
# return: none
sub __config_log
{
	my ($line, $prefix, $msg) = @_;

	# Ensure that the message ends in exactly one newline
	chomp($msg);
	$msg .= "\n";

	my $msg_prefix = "$prefix [$$]";
	if ($line > 0) {
		$msg_prefix .= " reading file $CONFIG at line $line";
	}
	$msg = "$msg_prefix: $msg";

	if ($opt_d or $DAEMON_STATUS == $DAEMON_STATUS_STARTING) {
		print STDERR $msg;
	} else {
		&ld_log("$msg");
	}
}

# config_warn
# Report a warning about the configuration
# pre: line: line number of the configuration file, see __config_log
#      msg: The warning
# return: none
sub config_warn
{
	my ($line, $msg) = @_;

	__config_log($line, "Warning", $msg);
}

# config_error
# Report an error in the configuration and abort
# pre: line: line number of the configuration file, see __config_log
#      msg: The error
# post: If the daemon is starting the pid file is removed and
#       ld_exit(2, ...) is called. Otherwise die is called.
# return: does not return normally
sub config_error
{
	my ($line, $msg) = @_;

	__config_log($line, "Error", $msg);
	if ($DAEMON_STATUS == $DAEMON_STATUS_STARTING) {
		&ld_rm_file("$RUNPID.$CFGNAME.pid");
		&ld_exit(2, "config_error: Configuration Error");
	} else {
		die;
	}
}

# ld_setup
# Fill in the derived settings of each virtual service in @VIRTUAL and of
# its real servers
# pre: none
# post: For each virtual service: proto and flags are set from the parsed
#       configuration, and negative (unset) values of checktimeout,
#       negotiatetimeout, checkcount and failurecount are replaced by
#       their defaults.
#       For each real server: forw, url and num_connects are set, and
#       request and receive are inherited from the virtual service where
#       the real server does not define both of its own.
# return: none
sub ld_setup
{
	for my $v (@VIRTUAL) {
		if ($$v{protocol} eq "tcp") {
			$$v{proto} = "-t";
		} elsif ($$v{protocol} eq "udp") {
			$$v{proto} = "-u";
		} elsif ($$v{protocol} eq "fwm") {
			$$v{proto} = "-f";
		}

		$$v{flags} = "$$v{proto} " . &get_virtual_option($v) . " ";
		if ($$v{protocol} eq "udp" && $$v{ops} eq "yes") {
			$$v{flags} .= "-o ";
		}
		$$v{flags} .= "-s $$v{scheduler} " if defined ($$v{scheduler});
		if (defined $$v{persistent}) {
			$$v{flags} .= "-p $$v{persistent} ";
			$$v{flags} .= "-M $$v{netmask} " if defined ($$v{netmask});
		}

		my $real = $$v{real};
		for my $r (@$real) {
			$$r{forw} = get_forward_flag($$r{forward});

			my $port=ld_checkport($v, $r);

			# An http_proxy is spoken to using http
			my $schema = $$v{service};
			if ($$v{service} eq 'http_proxy') {
				$schema = 'http';
			}

			if (defined $$r{request} && defined $$r{receive}) {
				# The real server has its own request/receive
				my $uri = $$r{request};
				$uri =~ s/^\///g;
				if ($$r{request} =~ /$schema:\/\//) {
					# The request is already a full URL
					$$r{url} = "$uri";
				} else {
					$$r{url} = "$schema:\/\/$$r{server}:$port\/$uri";
				}
			} else {
				# Use the request/receive of the virtual service
				my $uri = $$v{request};
				$uri =~ s/^\///g;
				if ($$v{service} eq 'http_proxy') {
					$$r{url} = "$uri";
				} else {
					$$r{url} = "$schema:\/\/$$r{server}:$port\/$uri";
				}
				$$r{request} = $$v{request} unless defined $$r{request};
				$$r{receive} = $$v{receive};
			}

			if ($$v{checktype} eq "combined") {
				$$r{num_connects} = 999999;
			} else {
				$$r{num_connects} = -1;
			}
		}

		# checktimeout and negotiate timeout are
		# mutual defaults for each other, so calculate
		# checktimeout in a temporary variable so as not
		# to affect the calculation of negotiatetimeout.
		my $checktimeout = $$v{checktimeout};
		if ($checktimeout < 0) {
			$checktimeout = $$v{negotiatetimeout};
		}
		if ($checktimeout < 0) {
			$checktimeout = $CHECKTIMEOUT;
		}
		if ($checktimeout < 0) {
			$checktimeout = $NEGOTIATETIMEOUT;
		}
		if ($checktimeout < 0) {
			$checktimeout = $DEFAULT_CHECKTIMEOUT;
		}

		if ($$v{negotiatetimeout} < 0) {
			$$v{negotiatetimeout} = $$v{checktimeout};
		}
		if ($$v{negotiatetimeout} < 0) {
			$$v{negotiatetimeout} = $NEGOTIATETIMEOUT;
		}
		if ($$v{negotiatetimeout} < 0) {
			$$v{negotiatetimeout} = $CHECKTIMEOUT;
		}
		if ($$v{negotiatetimeout} < 0) {
			$$v{negotiatetimeout} = $DEFAULT_NEGOTIATETIMEOUT;
		}

		$$v{checktimeout} = $checktimeout;

		if ($$v{checkcount} < 0) {
			$$v{checkcount} = $CHECKCOUNT;
		}

		if ($$v{failurecount} < 0) {
			$$v{failurecount} = $FAILURECOUNT;
		}
	}
}

# ld_readline
#
# Net::FTP seems to set the input record separator ($/) to null
# putting IO into slurp (whole file at a time, rather than line at a time)
# mode. Net::FTP does this using local $/, which should mean
# that the change doesn't affect code here, but it does. It also
# seems to be impossible to turn it off, by say setting $/ back to '\n'
# Perhaps there is more to this than meets the eye. Perhaps it's a perl bug.
# In any case, this should fix the problem.
#
# This should not affect pid or config file parsing as they are called
# before Net::FTP and as this appears to be a bit of a work around,
# I'd rather use it in as few places as possible
#
# Observed with perl v5.8.8 (Debian's perl 5.8.8-6)
# -- Horms, 17th July 2005
#
# pre: fd: file handle to read from
#      buf: reference to a list used to hold lines that have been read
#           from fd but not yet returned. The same reference should be
#           passed on each call for a given fd.
# return: the next line, terminated by "\n"
#         undef if no line is available
sub ld_readline
{
	my ($fd, $buf) = (@_);
	my $line;

	# Uncomment the following line to turn off this work around
	# return readline($fd);

	# Return a line left over from an earlier read, if there is one
	$line = shift @$buf;
	if (defined $line) {
		return $line . "\n";
	}

	# Read the next record, which may be many lines, and split it up.
	# readline returns undef at end of file, in which case there is
	# nothing to split.
	my $chunk = readline($fd);
	unless (defined $chunk) {
		return undef;
	}
	push @$buf, split /\n/, $chunk;

	$line = shift @$buf;
	if (defined $line) {
		return $line . "\n";
	}

	return undef;
}

# ld_read_ipvsadm
# Parses the output of "ipvsadm -L -n" and puts into a structure of
# the following form:
#
# {
#   (vip_address:vport|fwmark) protocol => {
#     "scheduler" => scheduler,
#     "persistent" => timeout,    # May be omitted
#     "netmask" => netmask,       # May be omitted
#     "real" => {
#       rip_address:rport => {
#         "forward" => forwarding_mechanism,
#         "weight"  => weight
#       },
#       ...
#     }
#   },
#   ...
# }
#
# where:
#   vip_address: IP address of virtual service
#   vport: Port of virtual service
#   fwmark: Firewall Mark of virtual service
#   scheduler: Scheduler for virtual service
#   timeout: Timeout for persistency. Omitted if service is not persistent.
#   netmask: Netmask for persistency. Omitted if service is not persistent.
#
#   rip_address: IP address of real server
#   rport: Port of real server
#   forwarding_mechanism: Forwarding mechanism for real server.
#                         One of: gate, ipip, masq.
#   weight: Weight of real server
#
# pre: none
# post: ipvsadm -L -n is parsed
# result: reference to structure detailed above.
sub ld_read_ipvsadm { my %oldsrv; my $real_service; my $fwd; my $buf = []; my $fh; my $line; # read status of current ipvsadm -L -n unless(open($fh, "$IPVSADM -L -n 2>&1|")){ &ld_exit(1, "Could not run $IPVSADM -L -n: $!"); } # Skip the first three lines $line = ld_readline($fh, $buf); $line = ld_readline($fh, $buf); $line = ld_readline($fh, $buf); while (1) { $line = ld_readline($fh, $buf); if (not defined $line) { last; } if ($line =~ /^(\w+)\s+(\d+\.\d+\.\d+\.\d+\:\d+|\[[0-9A-Fa-f:]+\]:\d+|\d+)( IPv6)?\s+(\w+)\s+persistent\s+(\d+)\s+mask\s+(.*)/) { $real_service = &gen_real_service_str($2, $1, $3); $oldsrv{"$real_service"} = {"real"=>{}, "scheduler"=>$4, "persistent"=>$5, "netmask"=>$6}; } elsif ($line =~ /^(\w+)\s+(\d+\.\d+\.\d+\.\d+\:\d+|\[[0-9A-Fa-f:]+\]:\d+|\d+)( IPv6)?\s+(\w+)\s+persistent\s+(\d+)/) { $real_service = &gen_real_service_str($2, $1, $3); $oldsrv{"$real_service"} = {"real"=>{}, "scheduler"=>$4, "persistent"=>$5}; } elsif ($line =~ /^(\w+)\s+(\d+\.\d+\.\d+\.\d+\:\d+|\[[0-9A-Fa-f:]+\]:\d+|\d+)( IPv6)?\s+(\w+)/) { $real_service = &gen_real_service_str($2, $1, $3); $oldsrv{"$real_service"} = {"real"=>{}, "scheduler"=>$4}; } elsif ($line =~ /^ ->\s+(\d+\.\d+\.\d+\.\d+\:\d+|\[[0-9A-Fa-f:]+\]:\d+)\s+(\w+)\s+(\d+)/) { if (not defined( $real_service)) { &ld_debug(2, "Real server read from ipvsadm " . "doesn't seem to be inside a " . "virtual service: \"$line\"\n"); next; } if ($2 eq "Route") { $fwd = "gate"; } elsif ($2 eq "Tunnel") { $fwd = "ipip"; } elsif ($2 eq "Masq") { $fwd = "masq"; } $oldsrv{"$real_service"}->{"real"}->{"$1"} = {"forward"=>$fwd, "weight"=>$3}; } else { &ld_debug(2, "Unknown line read from ipvsadm: " . "\"$line\"\n"); next; } } close($fh); return(\%oldsrv); } sub gen_real_service_str { my ($service_address, $protocol, $v6flag) = @_; return "$service_address ".lc($protocol).(defined($v6flag) ? "6" : ""); } sub get_real_service_str { my ($v) = (@_); if ($v->{"protocol"} eq "fwm") { return &get_virtual($v) . " " . $v->{protocol} . 
(($v->{addressfamily} == AF_INET6) ? "6" : ""); } else { return &get_virtual($v) . " " . $v->{protocol}; } } sub ld_start { my $oldsrv; my $real_service; my $nv; my $nr; my $server_down = {}; # read status of current ipvsadm -L -n $oldsrv=&ld_read_ipvsadm(); # make sure virtual servers are up to date foreach $nv (@VIRTUAL) { my $real_service = &get_real_service_str($nv); if (exists($oldsrv->{"$real_service"})) { # service exists, modify it &system_wrapper("$IPVSADM -E $$nv{flags}"); &ld_log("Changed virtual server: " . &get_virtual($nv)); } else { # no such service, create a new one &system_wrapper("$IPVSADM -A $$nv{flags}"); &ld_log("Added virtual server: " . &get_virtual($nv)); } } # make sure real servers are up to date foreach $nv (@VIRTUAL) { my $nreal = $nv->{real}; my $ov = $oldsrv->{&get_real_service_str($nv)}; my $or = $ov->{real}; my $fallback = fallback_find($nv); if (defined($fallback)) { delete($or->{"$fallback->{server}:$fallback->{port}"}); } for $nr (@$nreal) { my $real_str = "$nr->{server}:$nr->{port}"; if (! defined($or->{$real_str}) or $or->{$real_str}->{weight} == 0) { $server_down->{$real_str} = [$nv, $nr]; #service_set($nv, $nr, "down", {force => 1}); } else { if (defined $server_down->{$real_str}) { delete($server_down->{$real_str}); } service_set($nv, $nr, "up", {force => 1}); } delete($or->{$real_str}); } # remove remaining entries for real servers for my $k (keys %$or) { purge_untracked_service($nv, $k, "start"); delete($$or{$k}); } delete($oldsrv->{&get_real_service_str($nv)}); &fallback_on($nv); } for my $k (keys (%$server_down)) { my $v = $server_down->{$k}; if ($READDQUIESCENT eq "no") { # Ensure that the server is initially added service_set(@$v[0], @$v[1], "up", {force => 1}); } # Remove Server service_set(@$v[0], @$v[1], "down", {force => 1}); delete($server_down->{$k}); #sleep 5; } # remove remaining entries for virtual servers foreach $nv (@OLDVIRTUAL) { if (! 
defined($oldsrv->{&get_real_service_str($nv)})) { next; } purge_virtual($nv, "start"); } + + # initial check of all real servers while we are still starting up so + # any e-mail notifications sent out are in 'starting' daemon status. + _ld_main_check_all(); } sub ld_cmd_children { my ($cmd, %children) = (@_); # instantiate other ldirectord, if specified my $child; foreach $child (keys %children) { if ($cmd eq "reload_or_start") { if (&system_wrapper("$LDIRECTORD $child reload")) { &system_wrapper("$LDIRECTORD $child start"); } } else { &system_wrapper("$LDIRECTORD $child $cmd"); } } } sub ld_stop { # Kill children if ($FORKING eq 'yes') { foreach my $virtual_id (keys (%FORK_CHILDREN)) { my $pid = $FORK_CHILDREN{$virtual_id}; ld_log("Killing child $virtual_id PID=$pid"); kill 15, $pid; } } foreach my $v (@VIRTUAL) { next if ( (! defined($$v{cleanstop}) and $CLEANSTOP eq 'no') or (defined($$v{cleanstop}) and $$v{cleanstop} eq 'no') ); my $real = $$v{real}; foreach my $r (@$real) { if (defined $$r{virtual_status}) { purge_service($v, $r, "stop"); } } purge_virtual($v, "stop"); } } sub ld_main { # Main failover checking code while (1) { if ($FORKING eq 'yes') { foreach my $v (@VIRTUAL) { my $virtual_id = get_virtual_id_str($v); if (!exists($FORK_CHILDREN{$virtual_id})) { &ld_log("Child not running for $virtual_id, spawning"); my $pid = fork; if (!defined($pid)) { &ld_log("fork failed"); } elsif ($pid == 0) { run_child($v); } else { $FORK_CHILDREN{get_virtual_id_str($v)} = $pid; &ld_log("Spawned child $virtual_id PID=$pid"); } } elsif (waitpid($FORK_CHILDREN{get_virtual_id_str($v)}, WNOHANG)) { delete $FORK_CHILDREN{get_virtual_id_str($v)}; } } check_signal(); if (!check_cfgfile()) { sleep 1; } check_signal(); } else { - my @real_checked; - foreach my $v (@VIRTUAL) { - my $real = $$v{real}; - my $virtual_id = get_virtual_id_str($v); + _ld_main_check_all(); - REAL: foreach my $r (@$real) { - my $real_id = get_real_id_str($r, $v); - check_signal(); - foreach my $tmp_id 
(@real_checked) { - if($real_id eq $tmp_id) { - &ld_debug(3, "Already checked: real server=$real_id (virtual=$virtual_id)"); - next REAL; - } - } - _check_real($v, $r); - push(@real_checked, $real_id); - } - } check_signal(); if (!check_cfgfile()) { sleep $CHECKINTERVAL; } check_signal(); ld_emailalert_resend(); check_signal(); } } } sub run_child { my $v = shift; # Just exit on signals $SIG{'INT'} = "DEFAULT"; $SIG{'QUIT'} = "DEFAULT"; $SIG{'ILL'} = "DEFAULT"; $SIG{'ABRT'} = "DEFAULT"; $SIG{'FPE'} = "DEFAULT"; $SIG{'SEGV'} = "DEFAULT"; $SIG{'TERM'} = "DEFAULT"; $SIG{'BUS'} = "DEFAULT"; $SIG{'SYS'} = "DEFAULT"; $SIG{'XCPU'} = "DEFAULT"; $SIG{'XFSZ'} = "DEFAULT"; $SIG{'IOT'} = "DEFAULT"; $SIG{'PIPE'} = "IGNORE"; $SIG{'HUP'} = sub { exit 1 }; my $real = $$v{real}; my $virtual_id = get_virtual_id_str($v); my $checkinterval = $$v{checkinterval} || $CHECKINTERVAL; + + # delete any entries in EMAILSTATUS that don't belong to this child + my %myservices = (); + foreach my $r (@$real) { + my $virtual_str = &get_virtual($v); + my $id = $r->{server} . ":" . $r->{port} . 
" ($virtual_str)"; + $myservices{$id} = 1; + } + foreach my $id (keys %EMAILSTATUS) { + delete $EMAILSTATUS{$id} unless defined $myservices{$id}; + } + $0 = "ldirectord $virtual_id"; while (1) { foreach my $r (@$real) { $0 = "ldirectord $virtual_id checking $$r{server}"; _check_real($v, $r); } $0 = "ldirectord $virtual_id"; sleep $checkinterval; ld_emailalert_resend(); } } +# run checks for everything +sub _ld_main_check_all +{ + my @real_checked; + + foreach my $v (@VIRTUAL) { + my $real = $$v{real}; + my $virtual_id = get_virtual_id_str($v); + + REAL: foreach my $r (@$real) { + my $real_id = get_real_id_str($r, $v); + check_signal(); + foreach my $tmp_id (@real_checked) { + if($real_id eq $tmp_id) { + &ld_debug(3, "Already checked: real server=$real_id (virtual=$virtual_id)"); + next REAL; + } + } + _check_real($v, $r); + push(@real_checked, $real_id); + } + } +} + sub _check_real { my $v = shift; my $r = shift; my $real_id = get_real_id_str($r, $v); my $virtual_id = get_virtual_id_str($v); if (_check_real_for_maintenance($r)) { service_set($v, $r, "down", {do_log => 1, force => 1}, "Server in maintenance"); return; } elsif ($$v{checktype} eq "negotiate" || $$r{num_connects}>=$$v{num_connects}) { &ld_debug(2, "Checking negotiate: real server=$real_id (virtual=$virtual_id)"); if (grep $$v{service} eq $_, ("http", "https", "http_proxy")) { $$r{num_connects} = 0 if (check_http($v, $r) == $SERVICE_UP); } elsif ($$v{service} eq "pop") { $$r{num_connects} = 0 if (check_pop($v, $r, 0) == $SERVICE_UP); } elsif ($$v{service} eq "pops") { $$r{num_connects} = 0 if (check_pop($v, $r, 1) == $SERVICE_UP); } elsif ($$v{service} eq "imap") { $$r{num_connects} = 0 if (check_imap($v, $r) == $SERVICE_UP); } elsif ($$v{service} eq "imaps") { $$r{num_connects} = 0 if (check_imaps($v, $r) == $SERVICE_UP); } elsif ($$v{service} eq "smtp" or $$v{service} eq "submission") { $$r{num_connects} = 0 if (check_smtp($v, $r) == $SERVICE_UP); } elsif ($$v{service} eq "ftp") { $$r{num_connects} = 
0 if (check_ftp($v, $r) == $SERVICE_UP); } elsif ($$v{service} eq "ldap") { $$r{num_connects} = 0 if (check_ldap($v, $r) == $SERVICE_UP); } elsif ($$v{service} eq "nntp") { $$r{num_connects} = 0 if (check_nntp($v, $r) == $SERVICE_UP); } elsif ($$v{service} eq "dns") { $$r{num_connects} = 0 if (check_dns($v, $r) == $SERVICE_UP); } elsif ($$v{service} eq "sip") { $$r{num_connects} = 0 if (check_sip($v, $r) == $SERVICE_UP); } elsif ($$v{service} eq "radius") { $$r{num_connects} = 0 if (check_radius($v, $r) == $SERVICE_UP); } elsif ($$v{service} eq "mysql") { $$r{num_connects} = 0 if (check_mysql($v, $r) == $SERVICE_UP); } elsif ($$v{service} eq "pgsql") { $$r{num_connects} = 0 if (check_pgsql($v, $r) == $SERVICE_UP); } elsif ($$v{service} eq "oracle") { $$r{num_connects} = 0 if (check_oracle($v, $r) == $SERVICE_UP); } elsif ($$v{service} eq "simpletcp") { $$r{num_connects} = 0 if (check_simpletcp($v, $r) == $SERVICE_UP); } else { $$r{num_connects} = 0 if (check_none($v, $r) == $SERVICE_UP); } } elsif ($$v{checktype} eq "connect") { if ($$v{protocol} ne "udp") { &ld_debug(2, "Checking connect: real server=$real_id (virtual=$virtual_id)"); check_connect($v, $r); } else { &ld_debug(2, "Checking connect (ping): real server=$real_id (virtual=$virtual_id)"); check_ping($v, $r); } } elsif ($$v{checktype} eq "ping") { &ld_debug(2, "Checking ping: real server=$real_id (virtual=$virtual_id)"); check_ping($v, $r); } elsif ($$v{checktype} eq "external") { &ld_debug(2, "Checking external: real server=$real_id (virtual=$virtual_id)"); check_external($v, $r); } elsif ($$v{checktype} eq "external-perl") { &ld_debug(2, "Checking external-perl: real server=$real_id (virtual=$virtual_id)"); check_external_perl($v, $r); } elsif ($$v{checktype} eq "off") { &ld_debug(2, "Checking off: No real or fallback servers to be added\n"); } elsif ($$v{checktype} eq "on") { &ld_debug(2, "Checking on: Real servers are added without any checks\n"); &service_set($v, $r, "up"); } elsif ($$v{checktype} eq 
"combined") { &ld_debug(2, "Checking combined-connect: real server=$real_id (virtual=$virtual_id)"); if (check_connect($v, $r) == $SERVICE_UP) { $$r{num_connects}++; } else { $$r{num_connects} = 999999; } } } sub _check_real_for_maintenance { my $r = shift; return undef if(!$MAINTDIR); my $servername = ld_gethostbyaddr($$r{server}); # Extract just the first component of the full name so we can match short or FQDN names $servername =~ /^([a-z][a-z0-9\-]+)\./; my $servershortname = $1; if (-e "$MAINTDIR/$$r{server}:$$r{port}") { &ld_debug(2, "Server maintenance: Found file $$r{server}:$$r{port}"); return 1; } elsif (-e "$MAINTDIR/$$r{server}") { &ld_debug(2, "Server maintenance: Found file $$r{server}"); return 1; } elsif ($servername && -e "$MAINTDIR/$servername:$$r{port}") { &ld_debug(2, "Server maintenance: Found file $servername:$$r{port}"); return 1; } elsif ($servername && -e "$MAINTDIR/$servername") { &ld_debug(2, "Server maintenance: Found file $servername"); return 1; } elsif ($servershortname && -e "$MAINTDIR/$servershortname:$$r{port}") { &ld_debug(2, "Server maintenance: Found file $servershortname:$$r{port}"); return 1; } elsif ($servershortname && -e "$MAINTDIR/$servershortname") { &ld_debug(2, "Server maintenance: Found file $servershortname"); return 1; } return undef; } sub check_http { use LWP::UserAgent; use LWP::Debug; + use Net::HTTP; + use URI; if($DEBUG > 2) { LWP::Debug::level('+'); } my ($v, $r) = @_; my $host = $$r{server}; my $virtualhost = (defined $$v{virtualhost} ? $$v{virtualhost} : $host); &ld_debug(2, "check_http: url=\"$$r{url}\" " . 
"virtualhost=\"$virtualhost\""); if (inet_pton(AF_INET6,&ld_strip_brackets($host))) { - no warnings 'once'; require Net::INET6Glue::INET_is_INET6; - # Workaround for Net-HTTP IPv6 Address URLs Broken + } + + # Workaround for Net-HTTP IPv6 Address URLs Broken + if ($LWP::VERSION < 6.08 || $Net::HTTP::VERSION < 6.07 || $URI::VERSION < 1.64) { + no warnings 'once'; @LWP::Protocol::http::EXTRA_SOCK_OPTS = (PeerAddr => $host, PeerHost => &ld_strip_brackets($host), Host => &ld_strip_brackets($host)); } my $ua = new LWP::UserAgent(ssl_opts => { verify_hostname => 0 }); my $h = undef; if ($$v{service} eq "http_proxy") { my $port = ld_checkport($v, $r); $ua->proxy("http", "http://$$r{server}:$port/"); } else { $h = new HTTP::Headers("Host" => $virtualhost); } my $req = new HTTP::Request("$$v{httpmethod}", "$$r{url}", $h); my $res; # LWP does not seem to honour timeouts set using $ua->timeout() # for HTTPS. So use an alarm instead. This also has the advantage # of being cumulative timeout, rather than a per send/receive # timeout. eval { # LWP makes unguarded calls to eval # which throw a fatal exception if they fail # Needless to say, this is completely stupid. # Resetting of $SIG{'__DIE__'} is also # needed now that alarm() is used. local $SIG{'__DIE__'} = "DEFAULT"; local $SIG{'ALRM'} = sub { die "Timeout Alarm" }; &ld_debug(4, "Timeout is $$v{negotiatetimeout}"); &ld_debug(2, "Starting Check"); alarm $$v{negotiatetimeout}; &ld_debug(2, "Starting HTTP/HTTPS"); $res = $ua->request($req); &ld_debug(2, "Finished HTTP/HTTPS"); alarm 0; # Cancel the alarm }; if (not defined $res) { &ld_debug(2, "check_http: timeout"); goto down; } if ($$v{service} eq "https") { &ld_debug(2, "SSL-Cipher: " . ($res->header('Client-SSL-Cipher') || '')); &ld_debug(2, "SSL-Cert-Subject: " . ($res->header('Client-SSL-Cert-Subject') || '')); &ld_debug(2, "SSL-Cert-Issuer: " . ($res->header('Client-SSL-Cert-Issuer') || '')); } &ld_debug(2, "Return status: " . 
$res->status_line); my $recstr = $$r{receive}; if ($res->is_success && (!($recstr =~ /.+/) || $res->content =~ /$recstr/)) { service_set($v, $r, "up", {do_log => 1}, $res->status_line); &ld_debug(2, "check_http: $$r{url} is up\n"); return $SERVICE_UP; } my $log_message = $res->is_success ? $res->content : $res->status_line; service_set($v, $r, "down", {do_log => 1}, $log_message); &ld_debug(3, "Headers " . $res->headers->as_string); down: &ld_debug(2, "check_http: $$r{url} is down\n"); return $SERVICE_DOWN; } sub check_smtp { require Net::SMTP; my ($v, $r) = @_; my $port = ld_checkport($v, $r); &ld_debug(2, "Checking $$v{service}: server=$$r{server} port=$port"); my $smtp = new Net::SMTP($$r{server}, Port => $port, Timeout => $$v{negotiatetimeout}); if ($smtp) { $smtp->quit; service_set($v, $r, "up", {do_log => 1}); return $SERVICE_UP; } else { service_set($v, $r, "down", {do_log => 1}); return $SERVICE_DOWN; } } sub check_pop { require Mail::POP3Client; my ($v, $r, $ssl) = @_; my $port = ld_checkport($v, $r); - &ld_debug(2, "Checking pop server=$$r{server} port=$port ssl=$ssl"); + &ld_debug(2, "Checking POP3 server=$$r{server} port=$port ssl=$ssl"); my $pop = new Mail::POP3Client(USER => $$v{login}, PASSWORD => $$v{passwd}, HOST => $$r{server}, USESSL => $ssl, PORT => $port, DEBUG => 0, TIMEOUT => $$v{negotiatetimeout}); if (!$pop) { service_set($v, $r, "down", {do_log => 1}); return $SERVICE_DOWN; } if($$v{login} ne "") { my $authres = $pop->login(); $pop->close(); if (!$authres) { service_set($v, $r, "down", {do_log => 1}); return $SERVICE_DOWN; } } $pop->close(); service_set($v, $r, "up", {do_log => 1}); return $SERVICE_UP; } sub check_imap { require Net::IMAP::Simple; my ($v, $r) = @_; my $port = ld_checkport($v, $r); - &ld_debug(2, "Checking imap server=$$r{server} port=$port"); + &ld_debug(2, "Checking IMAP server=$$r{server} port=$port"); my $imap = Net::IMAP::Simple->new($$r{server}, port => $port, timeout => $$v{negotiatetimeout}); if (!$imap) { 
service_set($v, $r, "down", {do_log => 1}); return $SERVICE_DOWN; } if($$v{login} ne "") { my $authres = $imap->login($$v{login},$$v{passwd}); $imap->quit; if (!$authres) { service_set($v, $r, "down", {do_log => 1}); return $SERVICE_DOWN; } } $imap->quit(); service_set($v, $r, "up", {do_log => 1}); return $SERVICE_UP; } sub check_imaps { require Net::IMAP::Simple::SSL; my ($v, $r) = @_; my $port = ld_checkport($v, $r); - &ld_debug(2, "Checking imaps server=$$r{server} port=$port"); + &ld_debug(2, "Checking IMAPS server=$$r{server} port=$port"); my $imaps = Net::IMAP::Simple::SSL->new($$r{server}, port => $port, timeout => $$v{negotiatetimeout}); if (!$imaps) { service_set($v, $r, "down", {do_log => 1}); return $SERVICE_DOWN; } if($$v{login} ne "") { my $authres = $imaps->login($$v{login},$$v{passwd}); $imaps->quit; if (!$authres) { service_set($v, $r, "down", {do_log => 1}); return $SERVICE_DOWN; } } $imaps->quit(); service_set($v, $r, "up", {do_log => 1}); return $SERVICE_UP; } sub check_ldap { my ($v, $r) = @_; require Net::LDAP; my $port = ld_checkport($v, $r); my $result; my $recstr = $$r{receive}; - &ld_debug(2, "Checking ldap server=$$r{server} port=$port"); + &ld_debug(2, "Checking LDAP server=$$r{server} port=$port"); eval { local $SIG{'__DIE__'} = "DEFAULT"; local $SIG{'ALRM'} = sub { die "Timeout Alarm" }; &ld_debug(4, "Timeout is $$v{checktimeout}"); &ld_debug(2, "Starting Check"); alarm $$v{checktimeout}; my $ldap = Net::LDAP->new("$$r{server}", port => $port, timeout => $$v{negotiatetimeout}); if(!$ldap) { service_set($v, $r, "down", {do_log => 1}, "Connection failed"); &ld_debug(4, "Connection failed"); alarm 0; # Cancel the alarm return $SERVICE_DOWN; } my $mesg; if ($$v{login} && $$v{passwd}) { $mesg = $ldap->bind($$v{login}, password=>$$v{passwd}) ; } else { $mesg = $ldap->bind ; } if ($mesg->is_error) { service_set($v, $r, "down", {do_log => 1}, "Bind failed"); &ld_debug(4, "Bind failed"); alarm 0; # Cancel the alarm return $SERVICE_DOWN; } 
&ld_debug(4, "Base : " . substr($$r{request},1)); $result = $ldap->search ( base => substr($$r{request},1) . "", scope => "base", filter => "(objectClass=*)" ); alarm 0; # Cancel the alarm }; if (!defined($result)) { service_set($v, $r, "down", {do_log => 1}, "No answer received"); &ld_debug(2, "check timeout alarm"); return $SERVICE_DOWN; } if($result->count != 1) { service_set($v, $r, "down", {do_log => 1}, "No answer received"); &ld_debug(2, "Count failed : " . $result->count); return $SERVICE_DOWN; } my $href = $result->as_struct; my @arrayOfDNs = keys %$href ; if (!($recstr =~ /.+/) || $arrayOfDNs[0] =~ /$recstr/) { service_set($v, $r, "up", {do_log => 1}, "Success"); return $SERVICE_UP; } else { service_set($v, $r, "down", {do_log => 1}, "Response mismatch"); &ld_debug(4,"Message differs : " . ", " . $$r{receive} . ", " . $arrayOfDNs[0] . "."); return $SERVICE_DOWN; } } sub check_nntp { use IO::Socket; use IO::Socket::INET6; use IO::Select; my ($v, $r) = @_; my $sock; my $s; my $buf; my $port = ld_checkport($v, $r); my $status = 1; - &ld_debug(2, "Checking nntp server=$$r{server} port=$port"); + &ld_debug(2, "Checking NNTP server=$$r{server} port=$port"); unless ($sock = IO::Socket::INET6->new(PeerAddr => $$r{server}, PeerPort => $port, Proto => 'tcp', TimeOut => $$v{negotiatetimeout})) { service_set($v, $r, "down", {do_log => 1}); return $SERVICE_DOWN; } $s = IO::Select->new(); $s->add($sock); if (scalar($s->can_read($$v{negotiatetimeout})) == 0) { service_set($v, $r, "down", {do_log => 1}); } else { sysread($sock, $buf, 64); if ($buf =~ /^2/) { service_set($v, $r, "up", {do_log => 1}); $status = 0; } else { service_set($v, $r, "down", {do_log => 1}); } } $s->remove($sock); $sock->close; return $status; } sub check_radius { require Authen::Radius; my ($v, $r) = @_; - &ld_debug(2, "Checking radius"); + &ld_debug(2, "Checking RADIUS"); my $port = ld_checkport($v, $r); my $radius; my $result = ""; eval { local $SIG{'__DIE__'} = "DEFAULT"; local $SIG{'ALRM'} = 
sub { die "Timeout Alarm" }; &ld_debug(4, "Timeout is $$v{checktimeout}"); &ld_debug(2, "Starting Check"); alarm $$v{checktimeout}; - &ld_debug(2, "Starting Radius"); + &ld_debug(2, "Starting RADIUS"); $radius = new Authen::Radius(Host => "$$r{server}:$port", Secret=>$$v{secret}, TimeOut=>$$v{negotiatetimeout}, Errmode=>'die'); $result = $radius->check_pwd($$v{login}, $$v{passwd}); - &ld_debug(2, "Finished Radius"); + &ld_debug(2, "Finished RADIUS"); alarm 0; # Cancel the alarm }; if ($result eq "") { &service_set($v, $r, "down", {do_log => 1}); &ld_debug(3, "Deactivated service $$r{server}:$$r{port}: $@"); - &ld_debug(3, "Radius Error: ".$radius->get_error); + &ld_debug(3, "RADIUS Error: ".$radius->get_error); return $SERVICE_DOWN; } else { &service_set($v, $r, "up", {do_log => 1}); &ld_debug(3, "Activated service $$r{server}:$$r{port}"); return $SERVICE_UP; } } sub check_mysql { return check_sql(@_, "mysql", "database"); } sub check_pgsql { return check_sql(@_, "Pg", "dbname"); } sub check_sql_log_errstr { my ($prefix, $errstr) = (@_); for $_ (split /\n/, $errstr) { &ld_debug(4, "$prefix $_\n"); } } sub check_oracle { return check_sql(@_, "Oracle", "sid"); } sub check_sql { require DBI; my ($v, $r, $dbd, $dbname) = @_; my $port = ld_checkport($v, $r); my ($dbh, $sth, $query, $rows, $result); $result = $SERVICE_DOWN; $query = $$r{request}; $query =~ s#^/##; unless ($$v{login} && $query) { &ld_log("Error: Must specify a login and request string " . "for MySQL, Oracle and PostgreSQL checks. " . "Not adding $$r{server}.\n"); goto err_down; } $result=2; # Set result flag. Only ok if ends up at zero. &ld_debug(2, "Checking $$v{server} server=$$r{server} port=$port\n"); $dbh = DBI->connect("dbi:$dbd:$dbname=$$v{database};" . 
"host=$$r{server};port=$port", $$v{login}, $$v{passwd}); unless ($dbh) { &ld_debug(2, "Failed to bind to $$r{server} with DBI->errstr\n"); check_sql_log_errstr("Failed to bind to $$r{server} with", DBI->errstr); goto err_down; } $result--; $sth = $dbh->prepare($query); unless ($sth) { &ld_debug(2, "Error preparing statement: $dbh->errstr\n"); check_sql_log_errstr("Error preparing statement:", $dbh->errstr); goto err_disconect; } # Test to see if any errors are returned $sth->execute; if ($dbh->err) { &ld_debug(2, "Error executing statement: $dbh->errstr : $dbh->err\n"); check_sql_log_errstr("Error executing statement:", $dbh->errstr, $dbh->err); goto err_finish; } # On error "execute" will return undef. # # Assuming you're using 'SELECT' you will get the number of rows # returned from the db when running execute: the 'rows' method is # only used when doing something that is NOT a select. I cannot # imagine that you would ever want to insert or update from a # regular polling on this system, so we will assume you are using # SELECT here. # # Ideally you will do something like this: 'select * from # director_slave where enabled=1' This way you can have, say, a # MEMORY table in MySQL where you insert a value into a row # (enabled) that says whether or not you want to actually use this # in the pool from ldirector / ipvs, and disable them without # actually turning off your sql server. $sth->execute; if ($dbd eq "Oracle") { $sth->fetchrow_hashref() } unless ($rows = $sth->rows) { check_sql_log_errstr("Error executing statement:", $dbh->errstr, $dbh->err); goto err_finish; } # Actually look to see if there was data returned in this statement, # else disable node if($rows > 0) { goto out; } else { goto err_finish; } out: $result = $SERVICE_UP; err_finish: $sth->finish(); err_disconnect: $dbh->disconnect(); err_down: service_set($v, $r, $result == $SERVICE_UP ? 
"up" : "down", {do_log => 1}); return $result; } sub check_connect { my ($v, $r) = @_; my $port = ld_checkport($v, $r); eval { local $SIG{'__DIE__'} = "DEFAULT"; local $SIG{'ALRM'} = sub { die "Timeout Alarm" }; &ld_debug(4, "Timeout is $$v{checktimeout}"); alarm $$v{checktimeout}; my $sock = &ld_open_socket($$r{server}, $port, $$v{protocol}); if ($sock) { close($sock); } else { alarm 0; # Cancel the alarm die("Socket Connect Failed"); } &ld_debug(3, "Connected to $$r{server} (port $port)"); alarm 0; # Cancel the alarm }; if ($@) { &service_set($v, $r, "down", {do_log => 1}); &ld_debug(3, "Deactivated service $$r{server}:$$r{port}: $@"); return $SERVICE_DOWN; } else { &service_set($v, $r, "up", {do_log => 1}); &ld_debug(3, "Activated service $$r{server}:$$r{port}"); return $SERVICE_UP; } } sub check_external { my ($v, $r) = @_; my $v_server; if (defined $$v{server}) { $v_server = $$v{server}; } else { $v_server = $$v{fwm}; } my $result = system_timeout($$v{checktimeout}, $$v{checkcommand}, $v_server, $$v{port}, $$r{server}, $$r{port}); if ($result) { &service_set($v, $r, "down", {do_log => 1}); &ld_debug(3, "Deactivated service $$r{server}:$$r{port}: " . "$@ after calling $$v{checkcommand} with result " . "$result"); return 0; } else { &service_set($v, $r, "up", {do_log => 1}); &ld_debug(3, "Activated service $$r{server}:$$r{port}"); return 1; } } sub check_external_perl { my ($v, $r) = @_; my $result; my $v_server; eval { local $SIG{'__DIE__'} = "DEFAULT"; local $SIG{'ALRM'} = sub { die "Timeout Alarm" }; &ld_debug(4, "Timeout is $$v{checktimeout}"); alarm $$v{checktimeout}; if (defined $$v{server}) { $v_server = $$v{server}; } else { $v_server = $$v{fwm}; } my $cmdfunc = $check_external_perl__funcs{$$v{checkcommand}}; if (!defined($cmdfunc)) { open(CMDFILE, "<$$v{checkcommand}") || die "cannot open external-perl checkcommand file: $$v{checkcommand}"; $cmdfunc = eval("sub { \@ARGV=\@_; " . join("", ) . 
" }"); close(CMDFILE); $check_external_perl__funcs{$$v{checkcommand}} = $cmdfunc; } no warnings 'redefine'; local *CORE::GLOBAL::exit = sub { $result = shift; goto external_exit; }; $cmdfunc->($v_server, $$v{port}, $$r{server}, $$r{port}); external_exit: alarm 0; }; if ($@ or $result != 0) { &service_set($v, $r, "down"); &ld_debug(3, "Deactivated service $$r{server}:$$r{port}: " . "$@ after calling (external-perl) $$v{checkcommand} with result " . "$result"); return 0; } else { &service_set($v, $r, "up"); &ld_debug(3, "Activated service $$r{server}:$$r{port}"); return 1; } } sub check_sip { my ($v, $r) = @_; my $sip_d_port = ld_checkport($v, $r); - &ld_debug(2, "Checking sip server=$$r{server} port=$sip_d_port"); + &ld_debug(2, "Checking SIP server=$$r{server} port=$sip_d_port"); eval { use Socket; local $SIG{'__DIE__'} = "DEFAULT"; local $SIG{'ALRM'} = sub { die "Timeout Alarm" }; &ld_debug(4, "Timeout is $$v{checktimeout}"); alarm $$v{negotiatetimeout}; my $sock = &ld_open_socket($$r{server}, $sip_d_port, $$v{protocol}); unless ($sock) { alarm 0; die("Socket Connect Failed"); } my ($sip_s_addr_str, $sip_s_port) = &ld_get_addrport($sock); &ld_debug(3, "Connected from $sip_s_addr_str:$sip_s_port to " . $$r{server} . ":$sip_d_port"); select $sock; $|=1; select STDOUT; my $request = "OPTIONS sip:" . $$v{login} . " SIP/2.0\r\n" . "Via: SIP/2.0/UDP $sip_s_addr_str:$sip_s_port;" . "branch=z9hG4bKhjhs8ass877\r\n" . "Max-Forwards: 70\r\n" . "To: \r\n" . "From: ;tag=1928301774\r\n" . "Call-ID: " . (join "", map { unpack "H*", chr(rand(256)) } 1..8) . "\r\n" . "CSeq: 63104 OPTIONS\r\n" . "Contact: \r\n" . "Accept: application/sdp\r\n" . 
"Content-Length: 0\r\n\r\n"; print "Request:\n$request"; print $sock $request; my $ok; my $reply; while (<$sock>) { chomp; $/="\r"; chomp; $/="\n"; last if ($_ eq ""); if (!defined $ok) { # Check status $ok = $_; if ($ok !~ m/^SIP\/2.0 200 OK/) { alarm 0; # Cancel the alarm close($sock); die "$ok\n"; } next; } $reply .= "$_\n"; # Add more checks here as desired } alarm 0; # Cancel the alarm close($sock); if (!defined $ok) { die "No OK\n"; } print "Reply:\n$ok\n$reply\n"; }; if ($@) { &service_set($v, $r, "down", {do_log => 1}); &ld_debug(3, "Deactivated service $$r{server}:$$r{port}: $@"); return $SERVICE_DOWN; } else { &service_set($v, $r, "up", {do_log => 1}); &ld_debug(3, "Activated service $$r{server}:$$r{port}"); return $SERVICE_UP; } } sub check_simpletcp { my ($v, $r) = @_; my $d_port = ld_checkport($v, $r); &ld_debug(2, "Checking simpletcp server=$$r{server} port=$d_port"); eval { use Socket; local $SIG{'__DIE__'} = "DEFAULT"; local $SIG{'ALRM'} = sub { die "Timeout Alarm" }; &ld_debug(4, "Timeout is $$v{checktimeout}"); alarm $$v{negotiatetimeout}; my $sock = &ld_open_socket($$r{server}, $d_port, $$v{protocol}); unless ($sock) { alarm 0; die("Socket Connect Failed"); } my ($s_addr_str, $s_port) = &ld_get_addrport($sock); &ld_debug(3, "Connected from $s_addr_str:$s_port to " . $$r{server} . ":$d_port"); select $sock; $|=1; select STDOUT; my $request = substr($$r{request}, 1); $request =~ s/\\n/\n/g ; &ld_debug(2, "Checking simpletcp server=$$r{server} port=$d_port request:\n$request"); print $sock $request; shutdown($sock, SHUT_WR); my $ok; my $reply; while (<$sock>) { &ld_debug(2, "Checking simpletcp server=$$r{server} port=$d_port receive=" . $$r{receive} ." 
got: $_\n"); if ( $_ =~ /$$r{receive}/ ) { $ok = 1; last; } } alarm 0; # Cancel the alarm close($sock); if (!defined $ok) { die "No OK\n"; } }; if ($@) { &service_set($v, $r, "down", {do_log => 1}); &ld_debug(3, "Deactivated service $$r{server}:$$r{port}: $@"); return $SERVICE_DOWN; } else { &service_set($v, $r, "up", {do_log => 1}); &ld_debug(3, "Activated service $$r{server}:$$r{port}"); return $SERVICE_UP; } } sub check_ftp { require Net::FTP; my ($v, $r) = @_; my $ftp; my $memory; my $debug = ($DEBUG > 2) ? 1 : 0; my $port = ld_checkport($v, $r); - &ld_debug(2, "Checking ftp server=$$r{server} port=$port"); + &ld_debug(2, "Checking FTP server=$$r{server} port=$port"); &ld_debug(4, "Timeout is $$v{negotiatetimeout}"); open(TMP,'+>', undef); # In some cases Net::FTP dies if there is a timeout eval { unless ($ftp = Net::FTP->new("$$r{server}:$port", Timeout=>$$v{negotiatetimeout}, Debug=>$debug)) { die "Could not connect\n"; } $ftp->login($$v{login}, $$v{passwd}); $ftp->cwd("/"); $ftp->binary(); $ftp->pasv(); $ftp->get("$$r{request}", *TMP); $ftp->quit(); }; if ($@) { &ld_debug(2, "Warning: $@"); } seek TMP, 0, 0; local $/; $memory = ; close TMP; if ($memory =~ /$$r{receive}/) { service_set($v, $r, "up", {do_log => 1}); return $SERVICE_UP; } service_set($v, $r, "down", {do_log => 1}); return $SERVICE_DOWN; } sub check_dns { my $res; my $query; my $rr; my $request; my $server; my ($v,$r) = @_; my $port = ld_checkport($v, $r); { # Net::DNS makes unguarded calls to eval # which throw a fatal exception if they fail # Needless to say, this is completely stupid. 
local $SIG{'__DIE__'} = "DEFAULT"; # When fork=yes we need to ignore the child death local $SIG{'CHLD'} = "IGNORE"; require Net::DNS; } $res = new Net::DNS::Resolver; if($DEBUG > 2) { $res->debug(1); } $$r{"request"} =~ m/^\/?(.*)/; $request=$1; $server = &ld_strip_brackets($$r{server}); - &ld_debug(2, "Checking dns: request=\"$request\" receive=\"" + &ld_debug(2, "Checking DNS: request=\"$request\" receive=\"" . $$r{"receive"} . "\"\n"); eval { local $SIG{'__DIE__'} = "DEFAULT"; local $SIG{'ALRM'} = sub { die "timeout\n"; }; alarm($$v{negotiatetimeout}); $res->nameservers($server); $res->port($port); if ($$v{"protocol"} eq "tcp") { $res->usevc(1); } $query = $res->search($request); alarm(0); }; if (@$ eq "timeout\n" or ! $query) { service_set($v, $r, "down", {do_log => 1}, "Connection timed out"); return $SERVICE_DOWN; } my $recstr = $$r{receive}; foreach $rr ($query->answer) { if (($rr->type eq "A" and length($recstr) and $rr->address =~ /$recstr/) or ($rr->type eq "PTR" and length($recstr) and $rr->ptrdname =~ /$recstr/)) { service_set($v, $r, "up", {do_log => 1}, "Success"); return $SERVICE_UP; } } service_set($v, $r, "down", {do_log => 1}, "Response mismatch"); return $SERVICE_DOWN; } sub check_ping { use Net::Ping; my ($v,$r) = (@_); &ld_debug(2, "Checking ping: " . "host=\"" . $$r{server} . "\" checktimeout=\"" . $$v{"checktimeout"} . "\" checkcount=\"" . $$v{"checkcount"} . "\"\n"); my $p = Net::Ping->new("icmp","1","64"); for (my $attempt = 0; $attempt < $$v{"checkcount"}; $attempt++) { if ($p->ping($$r{server}, $$v{"checktimeout"})) { &ld_debug(2, "pong from $$r{server}\n"); service_set($v, $r, "up", {do_log => 1}); return $SERVICE_UP; } &ld_debug(2, "ping to $$r{server} timed out " . "(attempt " . ($attempt + 1) . "/" . $$v{"checkcount"} . ")\n"); } service_set($v, $r, "down"); return $SERVICE_DOWN; } # check_none # Dummy function to check service if service type is none. 
# Just activates the real server sub check_none { my ($v, $r) = @_; &ld_debug(2, "Checking none"); service_set($v, $r, "up", {do_log => 1}); return $SERVICE_UP; } # service_set # Used to bring up and down real servers. # This is the function you should call if you want to bring a real # server up or down. # This function is safe to call regardless of the current state of a # real server. # Do _not_ call _service_up or _service_down directly. # pre: v: virtual that the real service belongs to # Only used to determine the protocol of the service # r: real server to take down # state: up or down # up to bring the real service up # down to bring the real service up # flags: hash with the following (optional) keys: # force => 1 - force setting of the specified state # do_log => 1 - log the state to the monitorfile # (when called as the result of a check) # post: The real server is brought up or down for each virtual service # it belongs to. # return: none sub service_set { my ($v, $r, $state, $flags, $log_msg) = @_; my ($real, $virtual, $virt, $now); if ($$flags{'do_log'}) { $now = localtime(); if (!defined($log_msg)) { $log_msg = "-"; } # URI-escape special log characters ('|' and newlines) $log_msg =~ s/([%|\r\n])/sprintf("%%%.2x", ord($1))/eg; } # Find the real server in @REAL foreach $real (@REAL) { if($real->{"real"} eq get_real_id_str($r, $v)) { $virtual = $real->{"virtual"}; last; } } return unless (defined($virtual)); # Check each virtual service for the real server and make # changes as necessary foreach $v (@VIRTUAL){ # Use found rather than relying on tmp_id being # set when we leave the foreach loop. There # seems to some weirdness in Perl (5.6.0 on Redhat 7.2) my $found = 0; my $tmp_id; my $virtual_id = get_virtual_id_str($v); my $real_id = get_real_id_str($r, $v); my $log_str = "real server=$real_id" . 
" (virtual=$virtual_id)"; foreach $tmp_id (@$virtual) { if($virtual_id eq $tmp_id) { $found = 1; last; } } if ($found == 1) { if ($state=~/up/i) { _service_up($v, $r, $$flags{"force"}); &ld_debug(2, "Enabled $log_str"); } elsif ($state=~/down/i) { _service_down($v, $r, $$flags{"force"}); &ld_debug(2, "Disabled $log_str"); } if ($$v{"monitorfile"} and $$flags{"do_log"}) { my $real_log_msg = $real_id; $real_log_msg =~ tr/:/ /s; $real_log_msg =~ s/\\//g; unless( open(CHECKLOG, ">>$$v{monitorfile}") and print CHECKLOG "[$now] [$$] $real_log_msg [$state] $log_msg\n" and close(CHECKLOG) ) { die("Error writing to monitorfile '$$v{monitorfile}': $!"); } } } } } # _remove_service # Remove a real server by either making it quiescent or deleting it # Should be called by _service_down or fallback_off # I.e. If you want to change the state of a real server call service_set. # If you call this function directly then ldirectord will lose track # of the state of real servers. # If the real server exists (which it should) make it quiescent or # delete it, depending on the global and per virtual service quiescent flag. # If it # doesn't exist, just leave it as it will be added by the # _service_up code as appropriate. # pre: v: reference to virtual service to with the real server belongs # rservice: service to restore. Of the form server:port for a tcp or # udp service. Of the form fwmark for a fwm service. # rforw: Forwarding mechanism of service. Sould be one of "-g" "-i" or # "-m" # tag: Tag to use for logging. 
Should be either "real" or "fallback" # post: real service is taken up from the respective virtual service # if it is inactive # return: none sub _remove_service { my ($v, $rservice, $rforw, $tag) = (@_); my $oldsrv; my $ov; my $or; my $ipvsadm_args; my $log_args; my $virtual_str; my $old_rservice; my $is_quiescent; $virtual_str = &get_virtual($v); $oldsrv=&ld_read_ipvsadm(); $ov=$oldsrv->{&get_real_service_str($v)}; if(!defined($ov)){ return; } if ($tag ne "fallback" and ((defined $$v{quiescent} and $$v{quiescent} eq "yes") or (!defined($$v{quiescent}) and $QUIESCENT eq "yes"))){ $is_quiescent = "quiescent"; } $or=$ov->{"real"}->{$rservice}; # If a virtual service is a IP/port service (not fwmark) # and a real-servers uses a forwarding mechanism other than masq # then the port will always be that of the virtual service. # This includes real-servers that LVS has set to use # the local forwarding mechanism because their IP address # is local. Thus, if $rservice does not exist test # for the same ip address with the virtual servers port. # N.B: This could cause strange things to happen if # there is a clash between two real servers on different ports # that LVS has mapped to being the same thing. if(!defined($or)) { $old_rservice = $rservice; $rservice =~ /(.*):(.*)/; $rservice = $1; $virtual_str =~ /(.*):(.*)/; $rservice .= ":" . $2; $or=$ov->{"real"}->{$rservice}; # If this doesn't exist either, use the original service. # Otherwise if masq and quiescence is in use, the # real server is not local, and it has an alternate port to # the virtual server, using the mapped service will # result in a quiescent service being created on the # virtual server's port, which is not wanted. if(!defined($or)) { $rservice = $old_rservice; $old_rservice = undef; } } if((!defined($or) and !defined($is_quiescent)) or (defined($is_quiescent) and defined($or) and $or->{"weight"} eq 0 and get_forward_flag($or->{"forward"}) eq $rforw)){ return; } $ipvsadm_args = "$$v{proto} " . 
&get_virtual_option($v) . " -r $rservice"; $log_args = "$tag server: $rservice "; if(defined($old_rservice)) { $log_args .= "mapped from $old_rservice " } $log_args .= "($virtual_str)"; my $server_str=$rservice . " " . $virtual_str; my $currenttime=time(); if(defined($is_quiescent)) { if (defined($or)) { &system_wrapper("$IPVSADM -e " . "$ipvsadm_args $rforw -w 0"); &ld_log("Quiescent $log_args (Weight set to 0)"); &ld_emailalert_send("Quiescent $log_args (Weight set to 0)", $v, $rservice, $currenttime); } elsif ($READDQUIESCENT eq "yes") { &system_wrapper("$IPVSADM -a " . "$ipvsadm_args $rforw -w 0"); &ld_log("Readd Quiescent $log_args (Weight set to 0)"); &ld_emailalert_send("Quiescent $log_args (Weight set to 0)", $v, $rservice, $currenttime); } } else { &system_wrapper("$IPVSADM -d $ipvsadm_args"); &ld_log("Deleted $log_args"); &ld_emailalert_send("Deleted $log_args", $v, $rservice, $tag eq "fallback" ? 0 : $currenttime); } } # _restore_service # Make a retore a real server. The opposite of _quiescent_server. # Should be called by _service_up or fallback_on # I.e. If you want to change the state of a real server call service_set. # If you call this function directly then ldirectord will lose track # of the state of real servers. # If the real server exists (which it should) make it quiescent. If it # doesn't exist, just leave it as it will be added by the _service_up code # as appropriate. # pre: v: reference to virtual service to with the real server belongs # rservice: service to restore. Of the form server:port for a tcp or # udp service. Of the form fwmark for a fwm service. # rforw: Forwarding mechanism of service. Sould be one of "-g" "-i" or # "-m" # rwght: Weight of service. Sold be of the form "" # e.g. "1" # tag: Tag to use for logging. 
Should be either "real" or "fallback" # post: real service is taken up from the respective virtual service # if it is inactive # return: none sub _restore_service { my ($v, $rservice, $rforw, $rwght, $tag) = (@_); my $oldsrv; my $ov; my $or; my $ipvsadm_args; my $log_args; $ipvsadm_args = "$$v{proto} " . &get_virtual_option($v) . " -r $rservice $rforw -w $rwght"; $log_args = "$tag server: $rservice " . "(" #. scalar(%{$v->{real_status}}) . &get_virtual($v) . ")"; #if the server exists then restore its weight # otherwise add the server $oldsrv=&ld_read_ipvsadm(); $ov=$oldsrv->{&get_real_service_str($v)}; if(defined($ov)){ $or=$ov->{"real"}->{$rservice}; } if(defined($or)){ unless($or->{"weight"} eq $rwght and get_forward_flag($or->{"forward"}) eq $rforw){ &system_wrapper("$IPVSADM -e $ipvsadm_args"); &ld_log("Restored $log_args (Weight set to $rwght)"); &ld_emailalert_send("Restored $log_args " . "(Weight set to $rwght)", $v, $rservice, 0); } } else { &system_wrapper("$IPVSADM -a $ipvsadm_args"); &ld_log("Added $log_args (Weight set to $rwght)"); &ld_emailalert_send("Added $log_args (Weight set to $rwght)", $v, $rservice, 0); } } # Check the status of a server # Should only be called from _status_up, _status_down, # _service_up, or _service_down # Returns 1 if the server is up, 0 if down sub _status_check { my ($v, $r, $is_fallback) = (@_); my $virtual_id = get_virtual_id_str($v); my $real_id = get_real_id_str($r, $v); if (defined($is_fallback)) { if (defined($v->{real_status}) or (defined($v->{fallback_status}) and $v->{fallback_status}->{"$real_id"})) { return 1; } } else { if (defined ($v->{real_status}) and $v->{real_status}->{"$real_id"}) { return 1; } } return 0; } # Set the status of a server as up # Should only be called from _service_up or _ld_start sub _status_up { my ($v, $r, $is_fallback) = (@_); my $virtual_id = get_virtual_id_str($v); my $real_id = get_real_id_str($r, $v); return undef if(_status_check($v, $r, $is_fallback)); 
$r->{virtual_status}->{"$virtual_id"} = 1; if (defined $is_fallback) { $v->{fallback_status}->{"$real_id"} = 1; } else { $v->{real_status}->{"$real_id"} = 1; } return 1; } # Set the status of a server as down # Should only be called from _service_down or ld_stop sub _status_down { my ($v, $r, $is_fallback) = (@_); my $virtual_id = get_virtual_id_str($v); my $real_id = get_real_id_str($r, $v); return undef if (!_status_check($v, $r, $is_fallback)); if (defined($is_fallback)) { delete $v->{fallback_status}->{"$real_id"}; if (! %{$v->{fallback_status}}) { $v->{fallback_status} = undef; } } else { delete $v->{real_status}->{"$real_id"}; if (! %{$v->{real_status}}) { $v->{real_status} = undef; } } delete $r->{virtual_status}->{"$virtual_id"}; if (! %{$r->{virtual_status}}) { $r->{virtual_status} = undef; } return 1; } # _service_up # Bring a real service up if it is down # Should be called by service_set only # I.e. If you want to change the state of a real server call service_set. # If you call this function directly then ldirectord will lose track # of the state of real servers. # pre: v: reference to virtual service to with the real server belongs # r: reference to the real server to take down # post: real service is taken up from the respective virtual service # if it is inactive # return: none sub _service_up { my ($v, $r, $force) = (@_); if ($r->{failcount} > 0) { ld_log("Resetting soft failure count: " . $r->{server} . ":" . $r->{port} . " (" . get_virtual_id_str($v) . ")"); } $r->{failcount} = 0; if (! _status_up($v, $r) and ! defined($force)) { return; } &_restore_service($v, $r->{server} . ":" . $r->{port}, $r->{forw}, $r->{weight}, "real"); &fallback_off($v); } # _service_down # Bring a real service down if it is up # Should be called by service_set only # I.e. if you want to change the state of a real server call service_set. # If you call this function directly then ldirectord will lose track # of the state of real servers. 
# pre: v: reference to virtual service to with the real server belongs # r: reference to the real server to take down # post: real service is taken down from the respective virtual service # if it is active # return: none sub _service_down { my ($v, $r, $force) = @_; if (!_status_check($v, $r) and !defined($force)) { return; } $r->{failcount}++; if (!defined($force) and _status_check($v, $r) and ($r->{failcount} < $v->{failurecount})) { ld_log("Soft failure real server: " . $r->{server} . ":" . $r->{port} . " (" . get_virtual_id_str($v) . ") failure " . $r->{failcount} . "/" . $v->{failurecount}); return; } _status_down($v, $r); &_remove_service($v, $r->{server} . ":" . $r->{port}, $r->{forw}, "real"); &fallback_on($v); } # fallback_on # Turn on the fallback server for a virtual service if it is inactive # pre: v: virtual to turn fallback service on for # post: fallback server is turned on if it was inactive # return: none sub fallback_on { my ($v, $force) = (@_); my $fallback=&fallback_find($v); if (defined($fallback) and (_status_up($v, $fallback, "fallback") or defined($force))) { &_restore_service($v, $fallback->{server} . ":" . $fallback->{port}, get_forward_flag($fallback->{forward}), "1", "fallback"); } if (!defined ($v->{real_status})) { &do_fallback_command($v, "start"); } } # fallback_off # Turn off the fallback server for a virtual service if it is active # pre: v: virtual to turn fallback service off for # post: fallback server is turned off if it was active # return: none sub fallback_off { my ($v, $force) = (@_); my $fallback=&fallback_find($v); if (defined($fallback) and (_status_down($v, $fallback, "fallback") or defined($force))) { &_remove_service($v, $fallback->{server} . ":" . 
$fallback->{port}, get_forward_flag($fallback->{forward}), "fallback"); } if (defined ($v->{real_status})) { &do_fallback_command($v, "stop"); } } # fallback_find # Determine the fallback for a virtual service # pre: virtual: reference to a virtual service # post: none # return: $virtual->{"fallback"} if defined # else $FALLBACK->{$virtual->{"protocol"}} if defined # else undef sub fallback_find { my ($virtual) = (@_); my($global_fallback_ptr); # fallback pointer my $ipv6p = ($virtual->{addressfamily} == AF_INET6) ? 1 : 0; if( defined $virtual->{"fallback"} ) { return($virtual->{"fallback"}); } elsif ( not defined($FALLBACK) and not $ipv6p ) { return undef; } elsif ( not defined($FALLBACK6) and $ipv6p ) { return undef; } if ($ipv6p) { # IPv6 $global_fallback_ptr = $FALLBACK6; } else { $global_fallback_ptr = $FALLBACK; } # If the global fallback has a port, it can be used as is if (defined($global_fallback_ptr->{$virtual->{"protocol"}}->{"port"})) { return $global_fallback_ptr->{$virtual->{"protocol"}}; } # Else create an anonymous fallback my %anon_fallback = %{$global_fallback_ptr->{$virtual->{"protocol"}}}; $anon_fallback{"port"} = $virtual->{"port"}; return \%anon_fallback; } # fallback_command # Execute the fallback command with the given status if it wasn't executed # with this status already for the supplied virtual service. sub do_fallback_command { my ($v, $status) = (@_); if (defined $v->{fallbackcommand_status} and $v->{fallbackcommand_status} eq $status) { return; } $v->{fallbackcommand_status} = $status; if (defined($v->{fallbackcommand})) { &system_wrapper($v->{fallbackcommand} . " " . $status . " " . $v->{server} . ":" . $v->{port} . " " . $v->{protocol}); } elsif (defined($FALLBACKCOMMAND)) { &system_wrapper($FALLBACKCOMMAND . " " . $status . " " . $v->{server} . ":" . $v->{port} . " " . 
$v->{protocol}); } } # Used during stop, start and reload to remove stale real servers from LVS sub purge_untracked_service { my ($v, $rservice, $tag) = (@_); my $log_arg = "Purged real server ($tag): $rservice (" . &get_virtual($v) . ")"; &system_wrapper("$IPVSADM -d $v->{proto} " . &get_virtual_option($v) . " -r $rservice"); &ld_log($log_arg); &ld_emailalert_send($log_arg, $v, $rservice, 0); } # Used during stop, start and reload to remove stale real servers from LVS sub purge_service { my ($v, $r, $tag) = (@_); purge_untracked_service($v, "$r->{server}:$r->{port}", $tag); _status_down($v, $r); } # Used during stop, start and reload to remove stale virtual services from LVS sub purge_virtual { my ($v, $tag) = (@_); &system_wrapper("$IPVSADM -D $v->{proto} " . &get_virtual_option($v)); &ld_log("Purged virtual server ($tag): " . &get_virtual($v)); } sub check_cfgfile { my ($dev, $ino, $mode, $nlink, $uid, $gid, $rdev, $size, $atime, $mtime) = stat($CONFIG); my ($status); return if ($stattime==$mtime); $stattime = $mtime; use Digest::MD5 qw(md5 md5_hex); my $ctx = Digest::MD5->new; unless (open(CFGFILE, "<$CONFIG")) { &config_warn(0, "can not open file $CONFIG for checking"); return 0; } $ctx->addfile(*CFGFILE); close(CFGFILE); my $digest = $ctx->hexdigest; if (defined $checksum && $checksum ne $digest) { &ld_log("Configuration file '$CONFIG' has changed on disk"); if ($AUTOCHECK eq "yes") { &ld_log(" - reread new configuration"); &reread_config(); } else { &ld_log(" - ignore new configuration\n"); } if (defined($CALLBACK) and -x $CALLBACK) { &system_wrapper("$CALLBACK $CONFIG"); } $status = 1; } $checksum = $digest; return $status; } # ld_openlog # Open logger # make log rotation work # pre: none # post: If logger is a file, it opened and closed again as a test # If logger is syslog, it is opened so it can be used without # needing to be opened again. # Otherwise, nothing is done. 
# return: 0 on success # 1 on error sub ld_openlog { if ($opt_d or $SUPERVISED eq "yes") { # Instantly do nothing return(0); } if( $LDIRLOG =~ /^\/(.*)/ ) { # Open and close the file as a test. # We open the file each time we want to log to it unless (open(LOGFILE, ">>$LDIRLOG") and close(LOGFILE)) { return 1; } } else { # Assume LDIRLOG is a logfacility, log to syslog setlogsock( "unix" ); openlog( "ldirectord", "pid", "$LDIRLOG" ); } return(0); } # ld_log # Log a message. # pre: message: Message to write # post: message and timetsamp is written to loged # If logger is a file, it is opened and closed again as a # primitive means to make log rotation work # return: 0 on success # 1 on error sub ld_log { my ($message) = (@_); my $now = localtime(); &ld_debug(2, $message); chomp $message; if ($opt_d) { print STDERR "$message\n"; } elsif ($SUPERVISED eq "yes") { print "[$now] $message\n"; } elsif ( $LDIRLOG =~ /^\/(.*)/ ) { unless (open(LOGFILE, ">>$LDIRLOG") and print LOGFILE "[$now|$CFGNAME|$$] $message\n" and close(LOGFILE)) { print STDERR "$message\n"; return 1; } } else { # Assume LDIRLOG is a logfacility, log to syslog syslog( "info", "$message" ); } return(0); } sub daemon_status_str { if ($DAEMON_STATUS == $DAEMON_STATUS_STARTING) { return "starting"; } elsif ($DAEMON_STATUS == $DAEMON_STATUS_RUNNING) { return "running"; } elsif ($DAEMON_STATUS == $DAEMON_STATUS_STOPPING) { return "stopping"; } elsif ($DAEMON_STATUS == $DAEMON_STATUS_RELOADING) { return "reloading"; } return "UNKNOWN"; } # ld_emailalert_send # Send email alerts per virtual server # pre: message: Message to email # post: message is emailed if emailalert defined for virtualserver # return: 0 on success # 1 on error sub ld_emailalert_send { my ($subject, $v, $rserver, $currenttime) = (@_); my $status = 0; my $to_addr; my $frequency; my $virtual_str; my $id; my $statusfilter; my $smtp_server; + my $virtual_info; $frequency = defined $v->{emailalertfreq} ? 
$v->{emailalertfreq} : $EMAILALERTFREQ; $virtual_str = &get_virtual($v); $id = "$rserver ($virtual_str)"; if ($currenttime == 0 or $frequency == 0) { delete $EMAILSTATUS{"$id"}; } else { $EMAILSTATUS{$id}->{v} = $v; $EMAILSTATUS{$id}->{alerttime} = $currenttime; } $statusfilter = defined $v->{emailalertstatus} ? $v->{emailalertstatus} : $EMAILALERTSTATUS; if (($DAEMON_STATUS & $statusfilter) == 0) { return 0; } $to_addr = defined $v->{emailalert} ? $v->{emailalert} : $EMAILALERT; if ($to_addr eq "") { return 0; } $smtp_server = defined $v->{smtp} ? $v->{smtp} : $SMTP; &ld_log("emailalert: $subject"); + + # get extra service details + $virtual_info = _ld_virtual_server_details($v); + + # add service name into e-mail subject if it has been set + if ($v->{servicename}) { + $subject = "[" . $v->{servicename} ."] $subject"; + } + if (defined $smtp_server) { - $status = &ld_emailalert_net_smtp($smtp_server, $to_addr, $subject); + $status = &ld_emailalert_net_smtp($smtp_server, $to_addr, $subject, $virtual_info); } else { - $status = &ld_emailalert_mail_send($to_addr, $subject); + $status = &ld_emailalert_mail_send($to_addr, $subject, $virtual_info); } return($status); } +# generate virtual server information to go in to alert e-mails +sub _ld_virtual_server_details +{ + my ($v) = @_; + my $details; + + if ($v->{servicename}) { + $details .= "Service name: " . $v->{servicename} . "\n" + } + + if ($v->{comment}) { + $details .= "Comment: " . $v->{comment} . 
"\n"; + } + + return $details; +} + + # ld_emailalert_net_smtp # Send email alerts via SMTP server # pre: smtp: SMTP server defined # post: message is emailed if SMTP server is valid and working # return: 0 on success # 1 on error sub ld_emailalert_net_smtp { - my ($smtp_server, $to_addr, $subject) = (@_); + my ($smtp_server, $to_addr, $subject, $extrabody) = (@_); my $status = 0; use Net::SMTP; use Sys::Hostname; my $hostname = hostname; my $smtp = Net::SMTP->new($smtp_server); if ($smtp) { my $myusername = getpwuid( $< ); $smtp->mail("$myusername\@$hostname"); $smtp->to($to_addr); $smtp->data(); if($EMAILALERTFROM) { $smtp->datasend("From: $EMAILALERTFROM\n"); } else { $smtp->datasend("From: $myusername\@$hostname\n"); } $smtp->datasend("To: $to_addr\n"); $smtp->datasend("Subject: $subject\n\n"); $smtp->datasend("ldirectord host: $hostname\n" . "Log-Message: $subject\n" . "Daemon-Status: " . &daemon_status_str() . "\n"); + $smtp->datasend("\n$extrabody\n") if ($extrabody); $smtp->dataend(); $smtp->quit; } else { &ld_log("failed to send SMTP email message\n"); $status = 1; } return($status); } # ld_emailalert_mail_send # Send email alerts via Mail::Send # pre: smtp: SMTP server not defined # post: message is emailed if one of the Mail::Send methods works # return: 0 on success # 1 on error sub ld_emailalert_mail_send { - my ($to_addr, $subject) = (@_); + my ($to_addr, $subject, $extrabody) = (@_); my $emailmsg; my $emailfh; my $status = 0; use Mail::Send; $emailmsg = new Mail::Send Subject=>$subject, To=>$to_addr; $emailmsg->set('From', $EMAILALERTFROM) if ($EMAILALERTFROM); $emailfh = $emailmsg->open; print $emailfh "ldirectord host: " . hostname() . "\n" . "Log-Message: $subject\n" . "Daemon-Status: " . &daemon_status_str() . 
"\n"; + print $emailfh "\n$extrabody\n" if ($extrabody); unless ($emailfh->close) { &ld_log("failed to send email message\n"); $status = 1; } return($status); } # ld_emailalert_resend # Resend email alerts as necessary # pre: none # post: EMAILSTATUS array is updated and alerts are sent as necessary # return: none sub ld_emailalert_resend { my $currenttime = time(); my $es; my $id; my $rserver; my $frequency; foreach $id (keys %EMAILSTATUS) { $es = $EMAILSTATUS{$id}; $frequency = defined $es->{v}->{emailalertfreq} ? $es->{v}->{emailalertfreq} : $EMAILALERTFREQ; $id =~ m/(.*) /; $rserver = $1; if ($currenttime - $es->{alerttime} < $frequency) { next; } &ld_emailalert_send("Inaccessible real server: $id", $es->{v}, $rserver, $currenttime); } } # ld_debug # Log a message to a STDOUT. # pre: priority: priority of message # message: Message to write # post: message is written to STDOUT if $DEBUG >= priority # return: none sub ld_debug { my ($priority, $message) = (@_); if ( $DEBUG >= $priority ) { chomp $message; print STDERR "DEBUG${priority}: $message\n"; } } # system_wrapper # Wrapper around system() to log errors # # WARNING: Do not use alarm() together with this function. A internal # pipe will not be reclaimed (at least with Perl 5.8.8). This can # cause ldirectord to run out of file handles. # # pre: LIST: arguments to pass to system() # post: system() is called and if it returns non-zero a failure # message is logged # return: return value of system() sub system_wrapper { my (@args)=(@_); my $status; &ld_log("Running system(@args)") if $DEBUG>2; $status = system(@args); if($status != 0) { &ld_log("system(@args) failed: $!"); } return($status) } # system_timeout # Emulate system() with timeout via fork(), exec(), and waitpid() and # TERMinate the child on timeout. Set an alarm() for the timeout. # # This function does not suffer the deficiencies of system_wrapper() # of leaving pipes unreclaimed. Zombies are reaped by ld_handler_chld # and the related code. 
# # pre: timeout: timeout in seconds # LIST: arguments to pass to exec() # return: >= 0 exit status of the child process # 127 exec failed # -1 timeout # -2 fork failed sub system_timeout { my $timeout = shift; my (@args) = (@_); my $status; &ld_log("Running system_timeout($timeout, @args)") if $DEBUG>2; my $childpid = fork(); if (!defined($childpid)) { &ld_log("fork failed: $!"); return(-2); } elsif ($childpid) { # parent eval { local $SIG{'ALRM'} = sub { die "timeout\n"; }; alarm $timeout; waitpid($childpid, 0); $status = $? >> 8; # When die()-ing in the SIGALRM handler we # will never reach this point. Child/Zombie # is left behind. The grim reaper # (ld_handler_chld + ld_process_chld) will # take care of the zombie. }; alarm 0; if ($@) { # timeout if ($@ ne "timeout\n") { # log unexpected errors &ld_log("system_timeout($timeout, @args) " . "unexpected error: $@"); } else { &ld_log("system_timeout($timeout, @args) " . "timed out, kill -TERM child"); } # TERMinate child kill 15, $childpid; return(-1); } else { # did not timeout return($status); } } else { # child exec(@args) or &ld_exit(127, "exec(@args) failed: $!"); die "ld_exit() broken?, stopped"; } } # exec_wrapper # Wrapper around exec() to log errors # pre: LIST: arguments to pass to exec() # post: exec() is called and if it returns non-zero a failure # message is logged # return: return value of exec() on failure # does not return on success sub exec_wrapper { my (@args)=(@_); my $status; &ld_log("Running exec(@args)") if $DEBUG>2; $status = exec(@args) or &ld_log("exec(@args) failed"); return($status) } # ld_rm_file # Remove a file, symink, or anything that isn't a directory # and exists # pre: filename: file to delete # post: If filename does not exist or is a directory an # error state is reached # Else filename is delete # If $DEBUG >=2 errors are logged # return: 0 on success # -1 on error sub ld_rm_file { my ($filename)=(@_); my ($status); if(-d "$filename"){ &ld_debug(2, "ld_rm_file: $filename is a 
directory, skipping"); return(-1); } if(! -e "$filename"){ &ld_debug(2, "ld_rm_file: $filename doesn't exist, skipping"); return(-1); } $status = unlink($filename); if($status!=1){ &ld_debug(2, "ld_rm_file: Error deleting: $filename: $!"); } return(($status==1)?0:-1) } # is_octet # See if a number is an octet, that is >=0 and <=255 # pre: alleged_octet: the octet to test # post: alleged_octet is checked to see if it is valid # return: 1 if the alleged_octet is an octet # 0 otherwise sub is_octet { my ($alleged_octet)=(@_); if($alleged_octet<0){ return 0; } if($alleged_octet>255){ return 0; } return(1); } # is_ip # Check that a given string is an IP address # pre: alleged_ip: string representing ip address # post: alleged_ip is checked to see if it is valid # return: 1 if alleged_ip is a valid ip address # 0 otherwise sub is_ip { my ($alleged_ip)=(@_); if ($alleged_ip =~ /:/) { unless(inet_pton(AF_INET6,$alleged_ip)){ return 0; } return(1); } #If we don't have four, . delimited numbers then we have no hope unless($alleged_ip=~m/^(\d+)\.(\d+)\.(\d+)\.(\d+)$/) { return 0; } #Each octet mist be >=0 and <=255 unless(&is_octet($1)){ return 0; } unless(&is_octet($2)){ return 0; } unless(&is_octet($3)){ return 0; } unless(&is_octet($4)){ return 0; } return(1); } # ip_to_int # Turn an IP address given as a dotted quad into an integer # pre: ip_address: string representing IP address # post: post ip_address is converted to an integer # return: -1 if an error occurs # integer representation of IP address otherwise sub ip_to_int { my ($ip_address)=(@_); unless(&is_ip($ip_address)){ return(-1); } unless($ip_address=~m/^(\d+)\.(\d+)\.(\d+)\.(\d+)$/){ return(-1); } return(((((($1<<8)+$2)<<8)+$3)<<8)+$4); } # int_to_ip # Turn an IP address given as a dotted quad into an integer # pre: ip_address: string representing IP address # post: Decimal is converted to a dotted quad # return: -1 if an error occurs # integer representation of IP address otherwise sub int_to_ip { my 
($ip_address)=(@_); my $result = ""; return(sprintf( "%d.%d.%d.%d", ($ip_address>>24)&255, ($ip_address>>16)&255, ($ip_address>>8)&255, $ip_address&255 )); } # get_virtual # Get the service for a virtual # pre: nv: virtual to get the service for # post: none # return: fwmark of service if it is a fwm service # ip_address:port otherwise sub get_virtual { my ($nv) = (@_); if ($nv->{"protocol"} eq "fwm"){ return $nv->{"fwm"}; } else { return $nv->{"server"} . ":" . $nv->{"port"}; } } # get_virtual_option # Get the ipvsadm option corresponding to a virtual service # pre: nv: virtual to get the service for # post: none # return: fwmark of service if it is a fwm service # fwmark of service + "-6" if it is a fwm service and the address family is AF_INET6 # ip_address:port otherwise sub get_virtual_option { my ($nv) = (@_); my ($cmdline) = &get_virtual($nv); if ($nv->{"protocol"} eq "fwm" && $nv->{addressfamily} == AF_INET6) { $cmdline .= " -6"; } return $cmdline; } # get_real_id_str # Get an id string for a real server # pre: r: Real service. # protocol: protocol of the real service # tcp or udp # service: type of service # post: none # return: Id string for the real server sub get_real_id_str { my ($r, $v) = (@_); my $request = ""; my $receive = ""; my $checkport = ""; my $virtualhost = ""; my $check; my $real; if(defined($r->{"request"})) { $request = $r->{"request"}; } else { $request = $v->{"request"}; } if(defined($r->{"receive"})) { $receive = $r->{"receive"}; } else { $receive = $v->{"receive"}; } if($v->{"checktype"} eq "negotiate" or $v->{"checktype"} eq "combined") { $check = $v->{"checktype"} . ":" . $v->{"service"}; } elsif($v->{"checktype"} eq "external" or $v->{"checktype"} eq "external-perl") { $check = $v->{"checktype"} . ":" . $v->{"checkcommand"}; } else { $check = $v->{"checktype"}; } if(defined($v->{"checkport"})) { $checkport = $v->{"checkport"}; } if(defined($v->{"virtualhost"})) { $virtualhost = $v->{"virtualhost"}; } $real = $check . ":" . 
$v->{"protocol"} . ":" . $r->{"server"} . ":" . $r->{"port"} . ":" . $virtualhost . ":" . $checkport . ":" . $r->{"weight"} . ":" . $r->{"forward"} . ":" . quotemeta($request) . ":" . quotemeta($receive); } # get_virtual_id_str # Get an id string for a virtual service # pre: v: Virtual service # post: none # return: Id string for the virtual service sub get_virtual_id_str { my ($v) = (@_); if ($v->{"protocol"} eq "fwm") { return $v->{"protocol"} . (($v->{addressfamily} == AF_INET6)?"6":"") . ":" . &get_virtual($v); } else { return $v->{"protocol"} . ":" . &get_virtual($v); } } # get_forward_flag # Get the ipvsadm flag corresponding to a forwarding mechanism # pre: forward: Name of forwarding mechanism. u # Should be one of ipip, masq or gate # post: none # return: ipvsadm flag corresponding to the forwarding mechanism # " " if $forward is unknown sub get_forward_flag { my ($forward) = (@_); unless(defined($forward)) { return(" "); } if ($forward eq "masq") { return("-m"); } elsif ($forward eq "gate") { return("-g"); } elsif ($forward eq "ipip") { return("-i"); } return(" "); } # ld_exit # Exit and log a message # pre: exit_status: Integer exit status to exit with # 0 will be used if parameter is omitted # message: Message to log when exiting. May be omitted # post: If exit_status is non-zero or $DEBUG>2 then # message logged. # Programme exits with exit_status # return: does not return sub ld_exit { my ($exit_status, $message)=(@_); unless(defined($exit_status)) { $exit_status=0; } unless(defined($message)) { $message=""; } if ($exit_status!=0 or $DEBUG>2) { &ld_log("Exiting with exit_status $exit_status: $message"); } exit($exit_status); } # ld_open_socket # Open a socket connection # pre: remote: IP address as a dotted quad of remote host to connect to # port: port to connect to # protocol: Protocol to use. 
Should be either "tcp" or "udp" # post: A Socket connection is opened to the remote host # return: Open socket # undef on error sub ld_open_socket { my ($remote, $port, $protocol) = @_; my ($iaddr, $paddr, $pro, $result, $pf); local *SOCK; $remote = &ld_strip_brackets($remote); if (inet_pton(AF_INET6,$remote)) { $iaddr = inet_pton(AF_INET6,$remote); $paddr = pack_sockaddr_in6($port, $iaddr); $pf = PF_INET6; } else { $iaddr = inet_aton($remote) || die "no host: $remote"; $paddr = sockaddr_in($port, $iaddr); $pf = PF_INET; } $pro = getprotobyname($protocol); if ($protocol eq "udp") { socket(SOCK, $pf, SOCK_DGRAM, $pro) || die "socket: $!"; } else { socket(SOCK, $pf, SOCK_STREAM, $pro) || die "socket: $!"; } $result = connect(SOCK, $paddr); unless ($result) { return undef; } return *SOCK; } # daemon # Close and fork to become a daemon. # # Notes from unix programmer faq # http://www.landfield.com/faqs/unix-faq/programmer/faq/ # # Almost none of this is necessary (or advisable) if your daemon is being # started by `inetd'. In that case, stdin, stdout and stderr are all set up # for you to refer to the network connection, and the `fork()'s and session # manipulation should *not* be done (to avoid confusing `inetd'). Only the # `chdir()' step remains useful. # # Gratuitously over documented, because it can be # # Written by Horms, horms@verge.net.au for an unrelated project while # working for Zip World, http://www.zipworld.com.au/, 1997-1999. sub ld_daemon { # `fork()' so the parent can exit, this returns control to the command # line or shell invoking your program. This step is required so that # the new process is guaranteed not to be a process group leader. The # next step, `setsid()', fails if you're a process group leader. &ld_daemon_become_child(); # setsid()' to become a process group and session group leader. 
Since a # controlling terminal is associated with a session, and this new # session has not yet acquired a controlling terminal our process now # has no controlling terminal, which is a Good Thing for daemons. if(POSIX::setsid()<0){ &ld_exit(1, "ld_daemon: Could not setsid"); } # fork()' again so the parent, (the session group leader), can exit. # This means that we, as a non-session group leader, can never regain a # controlling terminal. &ld_daemon_become_child(); # `chdir("/")' to ensure that our process doesn't keep any directory in # use. Failure to do this could make it so that an administrator # couldn't unmount a filesystem, because it was our current directory. if(chdir("/")<0){ &ld_exit(1, "ld_daemon: Could not chdir"); } # `close()' fds 0, 1, and 2. This releases the standard in, out, and # error we inherited from our parent process. We have no way of knowing # where these fds might have been redirected to. Note that many daemons # use `sysconf()' to determine the limit `_SC_OPEN_MAX'. `_SC_OPEN_MAX' # tells you the maximum open files/process. Then in a loop, the daemon # can close all possible file descriptors. You have to decide if you # need to do this or not. If you think that there might be # file-descriptors open you should close them, since there's a limit on # number of concurrent file descriptors. close(STDIN); close(STDOUT); close(STDERR); # Establish new open descriptors for stdin, stdout and stderr. Even if # you don't plan to use them, it is still a good idea to have them open. # The precise handling of these is a matter of taste; if you have a # logfile, for example, you might wish to open it as stdout or stderr, # and open `/dev/null' as stdin; alternatively, you could open # `/dev/console' as stderr and/or stdout, and `/dev/null' as stdin, or # any other combination that makes sense for your particular daemon. 
# # This code used to open /dev/console for STDOUT and STDERR, # but that was changed to /dev/null to stop the code hanging in # the case where /dev/console is unavailable for some reason # http://www.osdl.org/developer_bugzilla/show_bug.cgi?id=1180 if(open(STDIN, ">/dev/null")<0){ &ld_exit(-1, "ld_daemon: Could not open /dev/null"); } if(open(STDERR, ">>/dev/null")<0){ &ld_exit(-1, "ld_daemon: Could not open /dev/null"); } } # ld_daemon_become_child # Fork, kill parent and return child process # pre: none # post: process forks and parent exits # All process exit with exit status -1 if an error occurs # return: parent: exits # child: none (this is the process that returns) # Written by Horms, horms@verge.net.au for an unrelated project while # working for Zip World, http://www.zipworld.com.au/, 1997-1999. sub ld_daemon_become_child { my($status); $status = fork(); if ($status<0){ &ld_exit(-1, "ld_daemon_become_child: Could not fork: $!"); } if ($status>0){ &ld_exit(0, "ld_daemon_become_child: Parent exiting as it should"); } } # ld_gethostbyname # Wrapper to gethostbyname. Look up the/an IP address of a hostname # If an IP address is given is it returned # pre: name: Hostname of IP address to lookup # af: Address Family: AF_INET etc.. # post: gethostbyname is called to find an IP address for $name # This is converted to a string # return: IP address # undef on error sub ld_gethostbyname { my ($name, $af)=(@_); if ($name =~ /\[(.*)\]/) { $name = $1; } my @host = getaddrinfo($name, 0, $af); if (!defined($host[3])) { return undef; } my @ret = getnameinfo($host[3], NI_NUMERICHOST | NI_NUMERICSERV); if ($host[0] == AF_INET6) { return "[$ret[0]]"; } else { return $ret[0]; } } # ld_gethostbyaddr # Wrapper to gethostbyaddr. Look up the hostname from an IP address. 
# If no reverse DNS record is found, return undef # pre: ip: IP address of host to lookup # post: gethostbyaddr is called to find a hostname for IP $ip # return: hostname # undef on error sub ld_gethostbyaddr { my ($ip)=(@_); $ip = &ld_strip_brackets($ip); my @host = getaddrinfo($ip,0); if (!defined($host[3])) { return undef; } my @ret = getnameinfo($host[3], NI_NAMEREQD); return undef unless(scalar(@ret) == 2); return $ret[0]; } # ld_getservbyname # Wrapper for getservbyname. Look up the port for a service name # If a port is given it is returned. # pre: name: Port or Service name to look up # post: if $name is a number # if 0<=$name<=65536 $name is returned # else undef is returned # else getservbyname is called to look up the port for the service # return: Port # undef on error sub ld_getservbyname { my ($name, $protocol)=(@_); if($name=~/^[0-9]+$/){ return(($name>=0 and $name<65536)?$name:undef); } my @serv=getservbyname($name, $protocol); return((@serv and defined($serv[2]))?$serv[2]:undef); } # ld_getservhostbyname # Wrapper for ld_gethostbyname and ld_getservbyname. Given a server of the # form ip_address|hostname[:port|servicename] return ip_address[:port] # pre: hostserv: Servver of the form ip_address|hostname[:port|servicename] # protocol: Protocol for service. Should be either "tcp" or "udp" # af: Address Family: AF_INET etc.. 
# post: lookups performed as per ld_getservbyname and ld_gethostbyname # return: ip_address[:port] # undef on error sub ld_gethostservbyname{ my ($hostserv, $protocol, $af) = (@_); my $ip; my $port; if ($hostserv =~ /(:(\d+|[A-Za-z0-9-_]+))?$/) { $port = $2; $ip = $hostserv; $ip =~ s/(:(\d+|[A-Za-z0-9-_]+))?$//; } else { $ip = $hostserv; } $ip=&ld_gethostbyname($ip, $af) or return(undef); if(defined($port)){ $port=&ld_getservbyname($port, $protocol); if (defined($port)) { return("$ip:$port"); } else { return(undef); } } return($ip); } # ld_find_cmd_path # Find executable in path # pre: cmd: command to find # path: ':' delimited paths to check # relative: if set, allow cmd to be a relative path, # which is checked first # return: path to command # undef if not found sub ld_find_cmd_path { my ($cmd, $path, $relative) = (@_); if (defined $relative and $relative and -f "$cmd" ) { return $cmd; } if ($cmd =~ /^\// and -x "$cmd" ) { return $cmd; } if ($cmd =~ /\//) { return undef; } for my $p (split /:/, $path) { if ( -x "$p/$cmd" ) { return "$p/$cmd"; } } return undef; } # ld_find_cmd_path # Find executable in $ENV{'PATH'} # pre: cmd: command to find # relative: if set, allow cmd to be a relative path, # which is checked first # return: path to command # undef if not found sub ld_find_cmd { return ld_find_cmd_path($_[0], $ENV{'PATH'}, $_[1]); } # ld_get_addrport # Get address string and port number from a given socket. 
# pre: socket # return: (address, port) # undef if cannot get sub ld_get_addrport { my($sock) = @_; my ($s_addr_str, $s_port, $s_addr, $len); my $s_sockaddr = getsockname($sock); $len = length($s_sockaddr); if ($len == 28) { # IPv6 ($s_port, $s_addr) = unpack_sockaddr_in6($s_sockaddr); $s_addr_str = inet_ntop(AF_INET6, $s_addr); $s_addr_str = "[$s_addr_str]"; } elsif ($len == 16) { # IPv4 ($s_port, $s_addr) = unpack_sockaddr_in($s_sockaddr); $s_addr_str = inet_ntop(AF_INET, $s_addr); } else { die "unexpected length of sockaddr\n"; } return ($s_addr_str, $s_port); } # ld_strip_brackets # Strip brackets in the string # pre: string # return: string sub ld_strip_brackets { my($str) = @_; $str =~ s/[\[\]]//g; return $str; } diff --git a/rgmanager/src/resources/clusterfs.sh b/rgmanager/src/resources/clusterfs.sh index 07fd73b10..ab2c292d5 100755 --- a/rgmanager/src/resources/clusterfs.sh +++ b/rgmanager/src/resources/clusterfs.sh @@ -1,341 +1,342 @@ #!/bin/bash # # Cluster File System mount/umount/fsck/etc. agent # # Copyright (C) 2000 Mission Critical Linux # Copyright (C) 2002-2011 Red Hat, Inc. All rights reserved. # # This program is free software; you can redistribute it and/or # modify it under the terms of the GNU General Public License # as published by the Free Software Foundation; either version 2 # of the License, or (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program; if not, write to the Free Software # Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. # . $(dirname $0)/utils/fs-lib.sh do_metadata() { cat < 1.0 This defines a cluster file system mount (i.e. GFS) Defines a cluster file system mount. 
Symbolic name for this file system. File System Name Path in file system heirarchy to mount this file system. Mount Point Block device, file system label, or UUID of file system. Device or Label File system type. If not specified, mount(8) will attempt to determine the file system type. File system type If set, the cluster will kill all processes using this file system when the resource group is stopped. Otherwise, the unmount will fail, and the resource group will be restarted. Force Unmount If set and unmounting the file system fails, the node will immediately reboot. Generally, this is used in conjunction with force_unmount support, but it is not required. Seppuku Unmount File system ID for NFS exports. This can be overridden in individual nfsclient entries. NFS File system ID If set, the node will try to kill lockd and issue reclaims across all remaining network interface cards. This happens always, regardless of unmounting failed. Enable NFS lock workarounds If set and unmounting the file system fails, the node will try to restart nfs daemon and nfs lockd to drop all filesystem references. Use this option as last resource. This option requires force_unmount to be set and it is not compatible with nfsserver resource. Enable NFS daemon and lockd workaround Options used when the file system is mounted. These are often file-system specific. See mount(8) and/or mount.gfs2(8) for supported mount options. Mount Options Use findmnt to determine if and where a filesystem is mounted. Disabling this uses the failback method (should be used if autofs maps are located on network storage (ie. nfs, iscsi, etc). Utilize findmnt to detect if and where filesystems are mounted EOT } verify_fstype() { # Auto detect? 
[ -z "$OCF_RESKEY_fstype" ] && return $OCF_SUCCESS case $OCF_RESKEY_fstype in gfs|gfs2) return $OCF_SUCCESS ;; *) ocf_log err "File system type $OCF_RESKEY_fstype not supported" return $OCF_ERR_ARGS ;; esac } verify_options() { declare -i ret=$OCF_SUCCESS # # From mount(8) # for o in `echo $OCF_RESKEY_options | sed -e s/,/\ /g`; do case $o in async|atime|auto|defaults|dev|exec|_netdev|noatime) continue ;; noauto|nodev|noexec|nosuid|nouser|ro|rw|suid|sync) continue ;; dirsync|user|users) continue ;; esac case $OCF_RESKEY_fstype in gfs) case $o in lockproto=*|locktable=*|hostdata=*) continue; ;; localcaching|localflocks|ignore_local_fs) continue; ;; num_glockd|acl|suiddir) continue ;; esac ;; gfs2) # XXX continue ;; esac ocf_log err "Option $o not supported for $OCF_RESKEY_fstype" ret=$OCF_ERR_ARGS done return $ret } do_verify() { verify_name || return $OCF_ERR_ARGS verify_fstype || return $OCF_ERR_ARGS verify_device || return $OCF_ERR_ARGS verify_mountpoint || return $OCF_ERR_ARGS verify_options || return $OCF_ERR_ARGS } do_pre_unmount() { # # Check the rgmanager-supplied reference count if one exists. # If the reference count is <= 1, we can safely proceed # if [ -n "$OCF_RESKEY_RGMANAGER_meta_refcnt" ]; then refs=$OCF_RESKEY_RGMANAGER_meta_refcnt if [ $refs -gt 0 ]; then ocf_log debug "Not unmounting $OCF_RESOURCE_INSTANCE - still in use by $refs other service(s)" return 2 fi fi if [ -z "$force_umount" ]; then ocf_log debug "Not umounting $dev (clustered file system)" return 2 fi # # Always do this hackery on clustered file systems. 
# if [ "$OCF_RESKEY_nfslock" = "yes" ] || \ [ "$OCF_RESKEY_nfslock" = "1" ]; then ocf_log warning "Dropping node-wide NFS locks" mkdir -p $mp/.clumanager/statd + chown rpcuser.rpcuser $mp/.clumanager/statd pkill -KILL -x lockd # Copy out the notify list; our # IPs are already torn down if notify_list_store $mp/.clumanager/statd; then notify_list_broadcast $mp/.clumanager/statd fi fi # Always invalidate buffers on clusterfs resources clubufflush -f $dev return 0 } do_force_unmount() { if [ "$OCF_RESKEY_nfsrestart" = "yes" ] || \ [ "$OCF_RESKEY_nfsrestart" = "1" ]; then ocf_log warning "Restarting nfsd/nfslock" nfsexports=$(cat /var/lib/nfs/etab) service nfslock stop service nfs stop service nfs start service nfslock start echo "$nfsexports" | { while read line; do nfsexp=$(echo $line | awk '{print $1}') nfsopts=$(echo $line | sed -e 's#.*(##g' -e 's#).*##g') nfsacl=$(echo $line | awk '{print $2}' | sed -e 's#(.*##g') if [ -n "$nfsopts" ]; then exportfs -i -o "$nfsopts" "$nfsacl":$nfsexp else exportfs -i "$nfsacl":$nfsexp fi done; } fi return 1 } main $* diff --git a/rgmanager/src/resources/fs.sh.in b/rgmanager/src/resources/fs.sh.in index 2924fa7d9..6d99f9561 100644 --- a/rgmanager/src/resources/fs.sh.in +++ b/rgmanager/src/resources/fs.sh.in @@ -1,502 +1,504 @@ #!/bin/bash # # File system (normal) mount/umount/fsck/etc. agent # # # Copyright (C) 1997-2003 Sistina Software, Inc. All rights reserved. # Copyright (C) 2004-2011 Red Hat, Inc. All rights reserved. # # This program is free software; you can redistribute it and/or # modify it under the terms of the GNU General Public License # as published by the Free Software Foundation; either version 2 # of the License, or (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. 
# # You should have received a copy of the GNU General Public License # along with this program; if not, write to the Free Software # Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. # . $(dirname $0)/utils/fs-lib.sh do_metadata() { cat < 1.0 This defines a standard file system mount (= not a clustered or otherwise shared file system). Defines a file system mount. Symbolic name for this file system. File System Name Path in file system heirarchy to mount this file system. Mount Point Block device, file system label, or UUID of file system. Device or Label File system type. If not specified, mount(8) will attempt to determine the file system type. File system type If set, the cluster will kill all processes using this file system when the resource group is stopped. Otherwise, the unmount will fail, and the resource group will be restarted. Force Unmount Use quick status checks. When set to 0 (the default), this agent behaves normally. When set to 1, this agent will not log errors incurred or perform the file system accessibility check (e.g. it will not try to read from/write to the file system). You should only set this to 1 if you have lots of file systems on your cluster or you are seeing very high load spikes as a direct result of this agent. Quick/brief status checks. If set and unmounting the file system fails, the node will immediately reboot. Generally, this is used in conjunction with force_unmount support, but it is not required. Seppuku Unmount If set and unmounting the file system fails, the node will try to kill lockd and issue reclaims across all remaining network interface cards. Enable NFS lock workarounds If set and unmounting the file system fails, the node will try to restart nfs daemon and nfs lockd to drop all filesystem references. Use this option as last resource. This option requires force_unmount to be set and it is not compatible with nfsserver resource. 
Enable NFS daemon and lockd workaround File system ID for NFS exports. This can be overridden in individual nfsclient entries. NFS File system ID If set, the file system will be checked (even if it is a journalled file system). This option is ignored for non-journalled file systems such as ext2. Force fsck support Options used when the file system is mounted. These are often file-system specific. See mount(8) for supported mount options. Mount Options Use findmnt to determine if and where a filesystem is mounted. Disabling this uses the failback method (should be used if autofs maps are located on network storage (ie. nfs, iscsi, etc). Utilize findmnt to detect if and where filesystems are mounted EOT } verify_fstype() { # Auto detect? [ -z "$OCF_RESKEY_fstype" ] && return 0 case $OCF_RESKEY_fstype in ext2|ext3|ext4|btrfs|jfs|xfs|reiserfs|vfat|vxfs) return 0 ;; *) echo "File system type $OCF_RESKEY_fstype not supported" return $OCF_ERR_ARGS ;; esac } verify_options() { declare -i ret=$OCF_SUCCESS declare o # # From mount(8) # for o in `echo $OCF_RESKEY_options | sed -e s/,/\ /g`; do case $o in async|atime|auto|defaults|dev|exec|_netdev|noatime) continue ;; noauto|nodev|noexec|nosuid|nouser|ro|rw|suid|sync) continue ;; dirsync|user|users) continue ;; esac do_verify_option $OCF_RESKEY_fstype "$o" case $OCF_RESKEY_fstype in ext2|ext3|ext4) case $o in bsddf|minixdf|check|check=*|nocheck|debug) continue ;; errors=*|grpid|bsdgroups|nogrpid|sysvgroups) continue ;; resgid=*|resuid=*|sb=*|grpquota|noquota) continue ;; quota|usrquota|nouid32) continue ;; esac if [ "$OCF_RESKEY_fstype" = "ext3" ] || [ "$OCF_RESKEY_fstype" = "ext4" ]; then case $o in noload|data=*) continue ;; esac fi ;; vfat) case $o in blocksize=512|blocksize=1024|blocksize=2048) continue ;; uid=*|gid=*|umask=*|dmask=*|fmask=*) continue ;; check=r*|check=n*|check=s*|codepage=*) continue ;; conv=b*|conv=t*|conv=a*|cvf_format=*) continue ;; cvf_option=*|debug|fat=12|fat=16|fat=32) continue ;; 
iocharset=*|quiet) continue ;; esac ;; jfs) case $o in conv|hash=rupasov|hash=tea|hash=r5|hash=detect) continue ;; hashed_relocation|no_unhashed_relocation) continue ;; noborder|nolog|notail|resize=*) continue ;; esac ;; xfs) case $o in biosize=*|dmapi|xdsm|logbufs=*|logbsize=*) continue ;; logdev=*|rtdev=*|noalign|noatime) continue ;; norecovery|osyncisdsync|quota|userquota) continue ;; uqnoenforce|grpquota|gqnoenforce) continue ;; sunit=*|swidth=*) continue ;; esac ;; btrfs) # tbd continue ;; esac echo Option $o not supported for $OCF_RESKEY_fstype ret=$OCF_ERR_ARGS done return $ret } do_validate() { verify_name || return $OCF_ERR_ARGS verify_fstype || return $OCF_ERR_ARGS verify_device || return $OCF_ERR_ARGS verify_mountpoint || return $OCF_ERR_ARGS verify_options || return $OCF_ERR_ARGS } do_pre_mount() { declare fstype="$OCF_RESKEY_fstype" # # Check to determine if we need to fsck the filesystem. # # Note: this code should not indicate in any manner suggested # file systems to use in the cluster. Known filesystems are # listed here for correct operation. # case "$fstype" in reiserfs) typeset fsck_needed="" ;; ext3) typeset fsck_needed="" ;; ext4) typeset fsck_needed="" ;; btrfs) typeset fsck_needed="" ;; jfs) typeset fsck_needed="" ;; xfs) typeset fsck_needed="" ;; vxfs) typeset fsck_needed="" ;; ext2) typeset fsck_needed=yes ;; minix) typeset fsck_needed=yes ;; vfat) typeset fsck_needed=yes ;; msdos) typeset fsck_needed=yes ;; "") typeset fsck_needed=yes ;; # assume fsck *) typeset fsck_needed=yes # assume fsck ocf_log warn "\ Unknown file system type '$fstype' for device $dev. Assuming fsck is required." ;; esac # # Fsck the device, if needed. # if [ -n "$fsck_needed" ] || [ "${OCF_RESKEY_force_fsck}" = "yes" ] ||\ [ "${OCF_RESKEY_force_fsck}" = "1" ]; then typeset fsck_log=@LOGDIR@/$(basename $dev).fsck.log ocf_log debug "Running fsck on $dev" fsck -p $dev >> $fsck_log 2>&1 ret_val=$? 
if [ $ret_val -gt 1 ]; then ocf_log err "\ 'fsck -p $dev' failed, error=$ret_val; check $fsck_log for errors" ocf_log debug "Invalidating buffers for $dev" $INVALIDATEBUFFERS -f $dev return $OCF_ERR_GENERIC fi rm -f $fsck_log fi return 0 } do_post_mount() { # # Create this for the NFS NLM broadcast bit # if [ $NFS_TRICKS -eq 0 ]; then if [ "$OCF_RESKEY_nfslock" = "yes" ] || \ [ "$OCF_RESKEY_nfslock" = "1" ]; then mkdir -p "$mp"/.clumanager/statd + chown rpcuser.rpcuser "$mp"/.clumanager/statd notify_list_merge "$mp"/.clumanager/statd fi fi return 0 } do_force_unmount() { if [ "$OCF_RESKEY_nfslock" = "yes" ] || \ [ "$OCF_RESKEY_nfslock" = "1" ]; then ocf_log warning "Dropping node-wide NFS locks" pkill -KILL -x lockd mkdir -p "$mp"/.clumanager/statd + chown rpcuser.rpcuser "$mp"/.clumanager/statd # Copy out the notify list; our # IPs are already torn down notify_list_store "$mp"/.clumanager/statd # Save for post-umount phase export nfslock_reclaim=1 fi if [ "$OCF_RESKEY_nfsrestart" = "yes" ] || \ [ "$OCF_RESKEY_nfsrestart" = "1" ]; then ocf_log warning "Restarting nfsd/nfslock" nfsexports=$(cat /var/lib/nfs/etab) service nfslock stop service nfs stop service nfs start service nfslock start echo "$nfsexports" | { while read line; do nfsexp=$(echo $line | awk '{print $1}') nfsopts=$(echo $line | sed -e 's#.*(##g' -e 's#).*##g') nfsacl=$(echo $line | awk '{print $2}' | sed -e 's#(.*##g') if [ -n "$nfsopts" ]; then exportfs -i -o "$nfsopts" "$nfsacl":$nfsexp else exportfs -i "$nfsacl":$nfsexp fi done; } fi # Proceed with fuser -kvm... 
return 1 } do_post_unmount() { if [ "$nfslock_reclaim" = "1" ]; then # If we have this flag set, do a full reclaim broadcast notify_list_broadcast "$mp"/.clumanager/statd fi return 0 } main $* diff --git a/tools/ocf-tester.in b/tools/ocf-tester.in index ae2c4a9ff..10822a5a0 100755 --- a/tools/ocf-tester.in +++ b/tools/ocf-tester.in @@ -1,432 +1,432 @@ #!/bin/sh # # $Id: ocf-tester,v 1.2 2006/08/14 09:38:20 andrew Exp $ # # Copyright (c) 2006 Novell Inc, Andrew Beekhof # All Rights Reserved. # # This program is free software; you can redistribute it and/or modify # it under the terms of version 2 of the GNU General Public License as # published by the Free Software Foundation. # # This program is distributed in the hope that it would be useful, but # WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. # # Further, this software is distributed without any warranty that it is # free of the rightful claim of any third person regarding infringement # or the like. Any license provided herein, whether implied or # otherwise, applies only to this software file. Patent licenses, if # any, provided herein do not apply to combinations of this program with # other software, or any other product whatsoever. # # You should have received a copy of the GNU General Public License # along with this program; if not, write the Free Software Foundation, # Inc., 59 Temple Place - Suite 330, Boston MA 02111-1307, USA. 
# LRMD=@libdir@/heartbeat/lrmd LRMADMIN=@sbindir@/lrmadmin DATADIR=@datadir@ METADATA_LINT="xmllint --noout --valid -" # set some common meta attributes, which are expected to be # present by resource agents export OCF_RESKEY_CRM_meta_timeout=20000 # 20 seconds timeout export OCF_RESKEY_CRM_meta_interval=10000 # reset this for probes num_errors=0 info() { [ "$quiet" -eq 1 ] && return echo "$*" } debug() { [ "$verbose" -eq 0 ] && return echo "$*" } usage() { # make sure to output errors on stderr [ "x$1" = "x0" ] || exec >&2 echo "Tool for testing if a cluster resource is OCF compliant" echo "" echo "Usage: ocf-tester [-LhvqdX] -n resource_name [-o name=value]* /full/path/to/resource/agent" echo "" echo "Options:" echo " -h This text" echo " -v Be verbose while testing" echo " -q Be quiet while testing" echo " -d Turn on RA debugging" echo " -X Turn on RA tracing (expect large output)" echo " -n name Name of the resource" echo " -o name=value Name and value of any parameters required by the agent" echo " -L Use lrmadmin/lrmd for tests" exit $1 } assert() { rc=$1; shift target=$1; shift msg=$1; shift local targetrc matched if [ $# = 0 ]; then exit_code=0 else exit_code=$1; shift fi for targetrc in `echo $target | tr ':' ' '`; do [ $rc -eq $targetrc ] && matched=1 done if [ "$matched" != 1 ]; then num_errors=`expr $num_errors + 1` echo "* rc=$rc: $msg" if [ $exit_code != 0 ]; then [ -n "$command_output" ] && cat<&2; usage 1;; *) done=1;; esac done if [ "x" = "x$OCF_ROOT" ]; then if [ -d /usr/lib/ocf ]; then export OCF_ROOT=/usr/lib/ocf else echo "You must supply the location of OCF_ROOT (common location is /usr/lib/ocf)" >&2 usage 1 fi fi if [ "x" = "x$OCF_RESOURCE_INSTANCE" ]; then echo "You must give your resource a name, set OCF_RESOURCE_INSTANCE" >&2 usage 1 fi agent=$1 if [ ! 
-e $agent ]; then echo "You must provide the full path to your resource agent" >&2 usage 1 fi installed_rc=5 stopped_rc=7 has_demote=1 has_promote=1 start_lrmd() { lrmd_timeout=0 lrmd_interval=0 lrmd_target_rc=EVERYTIME lrmd_started="" $LRMD -s 2>/dev/null rc=$? if [ $rc -eq 3 ]; then lrmd_started=1 $LRMD & sleep 1 $LRMD -s 2>/dev/null else return $rc fi } add_resource() { $LRMADMIN -A $OCF_RESOURCE_INSTANCE \ ocf \ `basename $agent` \ $(basename `dirname $agent`) \ $lrm_ra_args > /dev/null } del_resource() { $LRMADMIN -D $OCF_RESOURCE_INSTANCE } parse_lrmadmin_output() { awk ' BEGIN{ rc=1; } /Waiting for lrmd to callback.../ { n=1; next; } n==1 && /----------------operation--------------/ { n++; next; } n==2 && /return code:/ { rc=$0; sub("return code: *","",rc); next } n==2 && /---------------------------------------/ { n++; next; } END{ if( n!=3 ) exit 1; else exit rc; } ' } exec_resource() { op="$1" args="$2" $LRMADMIN -E $OCF_RESOURCE_INSTANCE \ $op $lrmd_timeout $lrmd_interval \ $lrmd_target_rc \ $args | parse_lrmadmin_output } if [ "$use_lrmd" = 1 ]; then echo "Using lrmd/lrmadmin for all tests" start_lrmd || { echo "could not start lrmd" >&2 exit 1 } trap ' [ "$lrmd_started" = 1 ] && $LRMD -k ' EXIT add_resource || { echo "failed to add resource to lrmd" >&2 exit 1 } fi lrm_test_command() { action="$1" msg="$2" debug "$msg" exec_resource $action "$lrm_ra_args" } test_permissions() { action=meta-data debug ${1:-"Testing permissions with uid nobody"} - su nobody -s /bin/sh $agent $action > /dev/null + su nobody -s /bin/sh -c "$agent $action" > /dev/null } test_metadata() { action=meta-data msg=${1:-"Testing: $action"} debug $msg $agent $action | (cd $DATADIR/resource-agents && $METADATA_LINT) rc=$? #echo rc: $rc return $rc } test_command() { action=$1; shift export __OCF_ACTION=$action msg=${1:-"Testing: $action"} if [ "$use_lrmd" = 1 ]; then lrm_test_command $action "$msg" return $? 
fi #echo Running: "export $ra_args; $agent $action 2>&1 > /dev/null" if [ $verbose -eq 0 ]; then command_output=`$agent $action 2>&1` else debug $msg $agent $action fi rc=$? #echo rc: $rc return $rc } # Begin tests info "Beginning tests for $agent..." if [ ! -f $agent ]; then assert 7 0 "Could not find file: $agent" fi if [ `id -u` = 0 ]; then test_permissions assert $? 0 "Your agent has too restrictive permissions: should be 755" else echo "WARN: Can't check agent's permissions because we're not root; they should be 755" fi test_metadata assert $? 0 "Your agent produces meta-data which does not conform to ra-api-1.dtd" OCF_TESTER_FAIL_HAVE_BINARY=1 export OCF_TESTER_FAIL_HAVE_BINARY test_command meta-data rc=$? if [ $rc -eq 3 ]; then assert $rc 0 "Your agent does not support the meta-data action" else assert $rc 0 "The meta-data action cannot fail and must return 0" fi unset OCF_TESTER_FAIL_HAVE_BINARY ra_args="export $ra_args" eval $ra_args test_command validate-all rc=$? if [ $rc -eq 3 ]; then assert $rc 0 "Your agent does not support the validate-all action" elif [ $rc -ne 0 ]; then assert $rc 0 "Validation failed. Did you supply enough options with -o ?" 1 usage $rc fi test_command monitor "Checking current state" rc=$? if [ $rc -eq 3 ]; then assert $rc 7 "Your agent does not support the monitor action" 1 elif [ $rc -eq 8 ]; then test_command demote "Cleanup, demote" assert $? 0 "Your agent was a master and could not be demoted" 1 test_command stop "Cleanup, stop" assert $? 0 "Your agent was a master and could not be stopped" 1 elif [ $rc -ne 7 ]; then test_command stop assert $? 0 "Your agent was active and could not be stopped" 1 fi test_command monitor assert $? $stopped_rc "Monitoring a stopped resource should return $stopped_rc" OCF_TESTER_FAIL_HAVE_BINARY=1 export OCF_TESTER_FAIL_HAVE_BINARY OCF_RESKEY_CRM_meta_interval=0 test_command monitor assert $? 
$stopped_rc:$installed_rc "The initial probe for a stopped resource should return $stopped_rc or $installed_rc even if all binaries are missing" unset OCF_TESTER_FAIL_HAVE_BINARY OCF_RESKEY_CRM_meta_interval=20000 test_command start assert $? 0 "Start failed. Did you supply enough options with -o ?" 1 test_command monitor assert $? 0 "Monitoring an active resource should return 0" OCF_RESKEY_CRM_meta_interval=0 test_command monitor assert $? 0 "Probing an active resource should return 0" OCF_RESKEY_CRM_meta_interval=20000 test_command notify rc=$? if [ $rc -eq 3 ]; then info "* Your agent does not support the notify action (optional)" else assert $rc 0 "The notify action cannot fail and must return 0" fi test_command demote "Checking for demote action" if [ $? -eq 3 ]; then has_demote=0 info "* Your agent does not support the demote action (optional)" fi test_command promote "Checking for promote action" if [ $? -eq 3 ]; then has_promote=0 info "* Your agent does not support the promote action (optional)" fi if [ $has_promote -eq 1 -a $has_demote -eq 1 ]; then test_command demote "Testing: demotion of started resource" assert $? 0 "Demoting a start resource should not fail" test_command promote assert $? 0 "Promote failed" test_command demote assert $? 0 "Demote failed" 1 test_command demote "Testing: demotion of demoted resource" assert $? 0 "Demoting a demoted resource should not fail" test_command promote "Promoting resource" assert $? 0 "Promote failed" 1 test_command promote "Testing: promotion of promoted resource" assert $? 0 "Promoting a promoted resource should not fail" test_command demote "Demoting resource" assert $? 0 "Demote failed" 1 elif [ $has_promote -eq 0 -a $has_demote -eq 0 ]; then info "* Your agent does not support master/slave (optional)" else echo "* Your agent partially supports master/slave" num_errors=`expr $num_errors + 1` fi test_command stop assert $? 0 "Stop failed" 1 test_command monitor assert $? 
$stopped_rc "Monitoring a stopped resource should return $stopped_rc" test_command start "Restarting resource..." assert $? 0 "Start failed" 1 test_command monitor assert $? 0 "Monitoring an active resource should return 0" test_command start "Testing: starting a started resource" assert $? 0 "Starting a running resource is required to succeed" test_command monitor assert $? 0 "Monitoring an active resource should return 0" test_command stop "Stopping resource" assert $? 0 "Stop could not clean up after multiple starts" 1 test_command monitor assert $? $stopped_rc "Monitoring a stopped resource should return $stopped_rc" test_command stop "Testing: stopping a stopped resource" assert $? 0 "Stopping a stopped resource is required to succeed" test_command monitor assert $? $stopped_rc "Monitoring a stopped resource should return $stopped_rc" test_command migrate_to "Checking for migrate_to action" rc=$? if [ $rc -ne 3 ]; then test_command migrate_from "Checking for migrate_from action" fi if [ $? -eq 3 ]; then info "* Your agent does not support the migrate action (optional)" fi test_command reload "Checking for reload action" if [ $? 
-eq 3 ]; then info "* Your agent does not support the reload action (optional)" fi if [ $num_errors -gt 0 ]; then echo "Tests failed: $agent failed $num_errors tests" >&2 exit 1 else echo $agent passed all tests exit 0 fi # vim:et:ts=8:sw=4 diff --git a/tools/ocft/IPaddr2 b/tools/ocft/IPaddr2 index 1cf81bf73..04698a056 100644 --- a/tools/ocft/IPaddr2 +++ b/tools/ocft/IPaddr2 @@ -1,137 +1,137 @@ # IPaddr2 CONFIG Agent IPaddr2 AgentRoot /usr/lib/ocf/resource.d/heartbeat HangTimeout 20 SETUP-AGENT ip addr add 192.168.144.1/24 dev eth0 brd 192.168.144.255 CLEANUP-AGENT ip addr del 192.168.144.1/24 dev eth0 CASE-BLOCK required_args Env OCF_RESKEY_ip=192.168.144.2 CASE-BLOCK check_iflabel_assigned Bash ip -4 -o addr show eth0 | grep -w 192.168.144.2/24 | grep -w eth0:iflabel >/dev/null # checking iflabel was assigned correctly CASE-BLOCK check_iflabel_removed Bash ! ip -4 -o addr show eth0 | grep -w 192.168.144.2/24 | grep -w eth0:iflabel >/dev/null # checking iflabel was removed correctly CASE-BLOCK default_status AgentRun stop CASE-BLOCK prepare Include required_args Include default_status CASE "check base env" Include prepare AgentRun start OCF_SUCCESS CASE "check base env: unset 'OCF_RESKEY_ip'" Include prepare Unenv OCF_RESKEY_ip AgentRun start OCF_ERR_CONFIGURED CASE "check base env: set invalid 'OCF_RESKEY_ip'" Include prepare Env OCF_RESKEY_ip=not_ip_address AgentRun start OCF_ERR_CONFIGURED CASE "check base env: set 'OCF_RESKEY_cidr_netmask'" Include prepare Env OCF_RESKEY_cidr_netmask=24 AgentRun start OCF_SUCCESS CASE "check base env: set invalid 'OCF_RESKEY_cidr_netmask'" Include prepare Env OCF_RESKEY_cidr_netmask=not_netmask AgentRun start OCF_ERR_CONFIGURED CASE "check base env: set 'OCF_RESKEY_broadcast'" Include prepare Env OCF_RESKEY_broadcast=192.168.144.255 AgentRun start OCF_SUCCESS CASE "check base env: set invalid 'OCF_RESKEY_broadcast'" Include prepare Env OCF_RESKEY_broadcast=not_broadcast AgentRun start OCF_ERR_CONFIGURED CASE "check base env: 
set 'OCF_RESKEY_nic'" Include prepare Env OCF_RESKEY_nic=eth0 AgentRun start OCF_SUCCESS CASE "check base env: set invalid 'OCF_RESKEY_nic'" Include prepare Env OCF_RESKEY_nic=not_nic AgentRun start OCF_ERR_CONFIGURED AgentRun validate-all OCF_ERR_CONFIGURED CASE "normal start" Include prepare AgentRun start OCF_SUCCESS CASE "normal stop" Include prepare AgentRun start AgentRun stop OCF_SUCCESS CASE "double start" Include prepare AgentRun start AgentRun start OCF_SUCCESS CASE "double stop" Include prepare AgentRun stop OCF_SUCCESS CASE "monitor with running" Include prepare AgentRun start AgentRun monitor OCF_SUCCESS CASE "monitor with not running" Include prepare AgentRun monitor OCF_NOT_RUNNING CASE "unimplemented command" Include prepare AgentRun no_cmd OCF_ERR_UNIMPLEMENTED CASE "Attachment to loopback interface" Env OCF_RESKEY_ip=127.0.0.3 AgentRun start OCF_SUCCESS AgentRun monitor OCF_SUCCESS AgentRun stop OCF_SUCCESS CASE "check additional env: set 'OCF_RESKEY_iflabel'" Include prepare Env OCF_RESKEY_nic=eth0 Env OCF_RESKEY_iflabel=iflabel AgentRun start OCF_SUCCESS Include check_iflabel_assigned AgentRun stop OCF_SUCCESS Include check_iflabel_removed # This is deprecated but still supported for the compatibility. CASE "check additional env: specify iflabel in 'OCF_RESKEY_nic'" Include prepare Env OCF_RESKEY_nic=eth0:iflabel AgentRun start OCF_SUCCESS Include check_iflabel_assigned AgentRun stop OCF_SUCCESS Include check_iflabel_removed # monitor should return OCF_ERR_GENERIC rather than OCF_ERR_CONFIGURED -# when the specified OCF_RESKEY_nic is disappeard by a failure. +# when the specified OCF_RESKEY_nic is vanished by a failure. # This has been changed as of 3.9.6. 
-CASE "monitor failure when 'OCF_RESKEY_nic' is disappeared" +CASE "monitor failure when 'OCF_RESKEY_nic' is vanished" Include prepare - Env OCF_RESKEY_nic=ethDisappear + Env OCF_RESKEY_nic=ethVanished Env OCF_RESKEY_CRM_meta_interval=10 # not in probe AgentRun monitor OCF_ERR_GENERIC diff --git a/tools/ocft/Makefile.am b/tools/ocft/Makefile.am index 8191c11d1..69c59eeec 100644 --- a/tools/ocft/Makefile.am +++ b/tools/ocft/Makefile.am @@ -1,63 +1,63 @@ # Author: John Shi # jshi@suse.de # This program is free software; you can redistribute it and/or # modify it under the terms of the GNU General Public License # as published by the Free Software Foundation; either version 2 # of the License, or (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program; if not, write to the Free Software # Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. 
# MAINTAINERCLEANFILES = Makefile.in -EXTRA_DIST = $(ocftcfgs_DATA) $(ocft_DATA) +EXTRA_DIST = $(ocftcfgs_DATA) $(ocft_DATA) $(ocft_SCRIPTS) sbin_SCRIPTS = ocft ocftcfgsdir = $(datadir)/$(PACKAGE_NAME)/ocft/configs ocftcfgs_DATA = apache \ IPaddr2 \ IPaddr2v4 \ IPaddr2v6 \ IPv6addr \ Filesystem \ LVM \ Raid1 \ IPsrcaddr \ MailTo \ jboss \ mysql \ mysql-proxy \ pgsql \ db2 \ oracle \ drbd.linbit \ exportfs \ exportfs-multidir \ nfsserver \ portblock \ iscsi \ named \ postfix \ sg_persist \ tomcat \ Xinetd \ Xen \ VirtualDomain \ SendArp ocftdir = $(datadir)/$(PACKAGE_NAME)/ocft ocft_DATA = README \ README.zh_CN \ caselib \ helpers.sh \ runocft.prereq ocft_SCRIPTS = runocft diff --git a/tools/ocft/README.in b/tools/ocft/README.in index c837fcc35..1c4ae1287 100644 --- a/tools/ocft/README.in +++ b/tools/ocft/README.in @@ -1,147 +1,147 @@ INTRODUCTION & DESIGN ~~~~~~~~~~~~~~~~~~~~~ - Ocft is a testing tool for resource agents. Instead of the policy of HA, - it mainly concerns whether resource agents run correct locally. It can - design types of complicated environments to test the reliability of - resource agents. Precisely, it is to display whether resource agents can - return to correct or expected value. The advantage of the tool provides - us with competence to design conditions which can be recorded or reproduced. + it mainly concerns whether resource agents run correct locally. It can + design types of complicated environments to test the reliability of + resource agents. Precisely, it is to display whether resource agents can + return to correct or expected value. The advantage of the tool provides + us with competence to design conditions which can be recorded or reproduced. Hence it is useful to debuggers. * Components ** Test case generator (@sbindir@/ocft) - Turning configuration files of test case to executable scripts. 
** Configuration file (@datadir@/@PACKAGE_NAME@/ocft/configs/) - - Every configuration file directs only one resource agent and share the same + - Every configuration file directs only one resource agent and share the same name with resource agent but contains more test cases. ** The testing script (/var/lib/@PACKAGE_NAME@/ocft/cases/) - - After the generator reads configuration files and generates many testing + - After the generator reads configuration files and generates many testing scripts and the script is underway, the test begins. * How to customize the environment of testing - - Ocft designs the running conditions through two ways, one is changing the - environment variables of resource agents (it is the interface left by OCF itself), - the other is modifying the OS environment of resource agents, such as altering + - Ocft designs the running conditions through two ways, one is changing the + environment variables of resource agents (it is the interface left by OCF itself), + the other is modifying the OS environment of resource agents, such as altering the permission of some key file or IP address of the machine. * How to test - - Firstly, you need to sketch the all complex and uncommon environments against - a certain resource agent and keep in mind what consequences may be caused by - these uncommon environments. - Secondly, write the designed conditions and foreknown consequences into - configuration files, and then run the generator to translate the test case to - executable scripts. - Finally, you need running these scripts to observe the output and learn - the running status of each test case, which will compares the predicated result - with the actual one. If they differ, you will be able to find the bugs of the + - Firstly, you need to sketch the all complex and uncommon environments against + a certain resource agent and keep in mind what consequences may be caused by + these uncommon environments. 
+ Secondly, write the designed conditions and foreknown consequences into + configuration files, and then run the generator to translate the test case to + executable scripts. + Finally, you need running these scripts to observe the output and learn + the running status of each test case, which will compares the predicated result + with the actual one. If they differ, you will be able to find the bugs of the resource agent. - All of the output with test will be recorded into the log files, you can find them in /var/lib/@PACKAGE_NAME@/ocft/cases/logs. HOW TO WRITE CONFIGURATION FILE ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - - There are only 6 top level options that are all spelled by capital letters and "-". + - There are only 6 top level options that are all spelled by capital letters and "-". Every top level option contains sub-options that they are initials. * 'CONFIG' (top level option) - Grammar: CONFIG - The design in this option is global and influences every test case. ** 'Agent' (sub-option) - Grammar: Agent AGENT_NAME - The agent name you want to test. ** 'AgentRoot' (sub-option) - Grammar: AgentRoot /usr/lib/ocf/resource.d/xxx - A few agents will go to "linbit" or "pacemaker" directory, if you define this option, ocft will use it to replace the default directory "heartbeat". ** 'InstallPackage' (sub-option) - Grammar: InstallPackage package [package2 [...]] - - It will test whether the system have installed the service of the resource agent. + - It will test whether the system have installed the service of the resource agent. If not, it will download from Internet and have it installed automatically. ** 'HangTimeout' (sub-option) - Grammar: HangTimeout secs - - If you alter some key options, some resource agents will get puzzled and stop, - which will influence the running of the following test case. 
Hence timeout setting is + - If you alter some key options, some resource agents will get puzzled and stop, + which will influence the running of the following test case. Hence timeout setting is needed, if the resource agent stops timeout, the scripts will kill this resource agent. * 'VARIABLE' (top level option) - - Garmmar: + - Grammar: VARIABLE VAR1=value1 VAR2=value2 ... - Define the global variable here, the variables can be visited everywhere, they can be referenced using $VAR_NAME. Note, the variables in VARIABLE are different from 'Env VAR1=value1', 'Env' can affect the activity of agent, but the variables in VARIABLE just be shared with top level option. * 'SETUP-AGENT' (top level option) - - Grammar: + - Grammar: SETUP-AGENT bash scripts... ... - - Some of Agents may need to be initialized before testing, you can do it here with bash script. + - Some of Agents may need to be initialized before testing, you can do it here with bash script. * 'CLEANUP-AGENT' (top level option) - - Garmmar: + - Grammar: CLEANUP-AGENT bash scripts... ... - If SETUP-AGENT set, usually you might be use this option do some cleaning work after test. * 'CASE' & 'CASE-BLOCK' (top level option) - Grammar: CASE "description" & CASE-BLOCK macro_name - - Usually, the conditions you designed are more than one and a few 'CASE "..."' will - appear in configuration file. It is worth noting that the following sub-options + - Usually, the conditions you designed are more than one and a few 'CASE "..."' will + appear in configuration file. It is worth noting that the following sub-options have 2 spellings: - One is general, where shell affects the local environment; the other is special, + One is general, where shell affects the local environment; the other is special, where each options added "@ipaddr". It can remotely execute shell codes. In other words, it is to execute the shell codes from a remote host, which is meaningful when a resource agent needs 2 hosts. 
This remote shell is not a remote execution only through "ssh", but running a remote shell in the background while the test case is running. The remote shell runs in the background till the end and saves the results during the process. That is to - say, you can alternatively carry out local and remote shell code segments. - The "CASE-BLOCK" option is a macro definer, the statements in "CASE-BLOCK" will be inserted + say, you can alternatively carry out local and remote shell code segments. + The "CASE-BLOCK" option is a macro definer, the statements in "CASE-BLOCK" will be inserted into "CASE" if you "Include" the "macro_name". ** 'Env' (sub-option) - Grammar: Env VARIABLE=value - - It is to set up an environment variable of the resource agent. They usually appear to + - It is to set up an environment variable of the resource agent. They usually appear to be OCF_RESKEY_xxx. One point is to be noted is there is no blank by both sides of "=". ** 'Unenv' (sub-option) - - Grammer: Unenv VARIABLE [VARIABLE2 [...]] + - Grammar: Unenv VARIABLE [VARIABLE2 [...]] - Remove the environment variable. ** 'Include' (sub-option) - - Garmmer: Include macro_name - - It will be replaced by statements in 'macro_name', of course, you should define the + - Grammar: Include macro_name + - It will be replaced by statements in 'macro_name', of course, you should define the content of 'macro_name' with 'CASE-BLOCK' first. ** 'Bash' (sub-option) - Grammar: Bash bash_codes - - This option is to set up the environment of OS, where you can insert BASH code to - customize the system randomly. Note, do not cause unrecoverable consequences to the + - This option is to set up the environment of OS, where you can insert BASH code to + customize the system randomly. Note, do not cause unrecoverable consequences to the system. ** 'BashAtExit' (sub-option) - Grammar: BashAtExit bash_codes - - This option is to recover the OS environment in order to run another test case - correctly. 
Of cause you can use 'Bash' option to recover it. However, if mistakes occur - in the process, the script will quit directly instead of running your recovery codes. - If it happens, you ought to use BashAtExit which can restore the system environment + - This option is to recover the OS environment in order to run another test case + correctly. Of cause you can use 'Bash' option to recover it. However, if mistakes occur + in the process, the script will quit directly instead of running your recovery codes. + If it happens, you ought to use BashAtExit which can restore the system environment before you quit. ** 'AgentRun' (sub-option) - Grammar: AgentRun cmd [ret_value] - - This option is to run resource agent. "cmd" is the parameter of the resource agent, - such as "start, status, stop ...". The second parameter is optional. It will compare the - actual returned value with the expected value when the script has run recourse agent. + - This option is to run resource agent. "cmd" is the parameter of the resource agent, + such as "start, status, stop ...". The second parameter is optional. It will compare the + actual returned value with the expected value when the script has run recourse agent. If differs, bugs will be found. diff --git a/tools/ocft/runocft b/tools/ocft/runocft index f66b6a462..d269a6bba 100755 --- a/tools/ocft/runocft +++ b/tools/ocft/runocft @@ -1,37 +1,38 @@ +#!/bin/sh OCFTDIR=/usr/share/resource-agents/ocft CONFDIR=$OCFTDIR/configs prereq_run() { eval "$@" } prereq_prog() { which $@ } test_prereq() { local tp arg tp=`echo $prereq|sed 's/:.*//'` arg=`echo $prereq|sed 's/[a-z]*://'` prereq_$tp $arg >/dev/null 2>&1 } rm -f ocft.FAILED rc=0 while read f prereq; do if [ -n "$prereq" ] && ! test_prereq; then echo "$f: prerequisite not fulfilled, skipping" continue fi ocft make $f if ! 
ocft test $f; then echo $f >> ocft.FAILED rc=1 fi done < $OCFTDIR/runocft.prereq if [ -f ocft.FAILED ]; then echo "The following ocft tests failed:" cat ocft.FAILED fi exit $rc diff --git a/tools/tickle_tcp.c b/tools/tickle_tcp.c index cf0bdcb39..7c5a53713 100644 --- a/tools/tickle_tcp.c +++ b/tools/tickle_tcp.c @@ -1,379 +1,379 @@ /* Tickle TCP connections tool Author: Jiaju Zhang Based on the code in CTDB http://ctdb.samba.org/ written by Andrew Tridgell and Ronnie Sahlberg This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program; if not, see . 
*/ #include #include #include #include #include #include #include #include #include #include #include #include #include #define discard_const(ptr) ((void *)((intptr_t)(ptr))) typedef union { struct sockaddr sa; struct sockaddr_in ip; struct sockaddr_in6 ip6; } sock_addr; uint32_t uint16_checksum(uint16_t *data, size_t n); void set_nonblocking(int fd); void set_close_on_exec(int fd); static int parse_ipv4(const char *s, unsigned port, struct sockaddr_in *sin); static int parse_ipv6(const char *s, const char *iface, unsigned port, sock_addr *saddr); int parse_ip(const char *addr, const char *iface, unsigned port, sock_addr *saddr); int parse_ip_port(const char *addr, sock_addr *saddr); int send_tickle_ack(const sock_addr *dst, const sock_addr *src, uint32_t seq, uint32_t ack, int rst); static void usage(void); uint32_t uint16_checksum(uint16_t *data, size_t n) { uint32_t sum=0; while (n >= 2) { sum += (uint32_t)ntohs(*data); data++; n -= 2; } if (n == 1) { sum += (uint32_t)ntohs(*(uint8_t *)data); } return sum; } static uint16_t tcp_checksum(uint16_t *data, size_t n, struct iphdr *ip) { uint32_t sum = uint16_checksum(data, n); uint16_t sum2; sum += uint16_checksum((uint16_t *)(void *)&ip->saddr, sizeof(ip->saddr)); sum += uint16_checksum((uint16_t *)(void *)&ip->daddr, sizeof(ip->daddr)); sum += ip->protocol + n; sum = (sum & 0xFFFF) + (sum >> 16); sum = (sum & 0xFFFF) + (sum >> 16); sum2 = htons(sum); sum2 = ~sum2; if (sum2 == 0) { return 0xFFFF; } return sum2; } static uint16_t tcp_checksum6(uint16_t *data, size_t n, struct ip6_hdr *ip6) { uint32_t phdr[2]; uint32_t sum = 0; uint16_t sum2; sum += uint16_checksum((uint16_t *)(void *)&ip6->ip6_src, 16); sum += uint16_checksum((uint16_t *)(void *)&ip6->ip6_dst, 16); phdr[0] = htonl(n); phdr[1] = htonl(ip6->ip6_nxt); sum += uint16_checksum((uint16_t *)phdr, 8); sum += uint16_checksum(data, n); sum = (sum & 0xFFFF) + (sum >> 16); sum = (sum & 0xFFFF) + (sum >> 16); sum2 = htons(sum); sum2 = ~sum2; if (sum2 == 0) { 
return 0xFFFF; } return sum2; } void set_nonblocking(int fd) { unsigned v; v = fcntl(fd, F_GETFL, 0); fcntl(fd, F_SETFL, v | O_NONBLOCK); } void set_close_on_exec(int fd) { unsigned v; v = fcntl(fd, F_GETFD, 0); fcntl(fd, F_SETFD, v | FD_CLOEXEC); } static int parse_ipv4(const char *s, unsigned port, struct sockaddr_in *sin) { sin->sin_family = AF_INET; sin->sin_port = htons(port); if (inet_pton(AF_INET, s, &sin->sin_addr) != 1) { fprintf(stderr, "Failed to translate %s into sin_addr\n", s); return -1; } return 0; } static int parse_ipv6(const char *s, const char *iface, unsigned port, sock_addr *saddr) { saddr->ip6.sin6_family = AF_INET6; saddr->ip6.sin6_port = htons(port); saddr->ip6.sin6_flowinfo = 0; saddr->ip6.sin6_scope_id = 0; if (inet_pton(AF_INET6, s, &saddr->ip6.sin6_addr) != 1) { fprintf(stderr, "Failed to translate %s into sin6_addr\n", s); return -1; } if (iface && IN6_IS_ADDR_LINKLOCAL(&saddr->ip6.sin6_addr)) { saddr->ip6.sin6_scope_id = if_nametoindex(iface); } return 0; } int parse_ip(const char *addr, const char *iface, unsigned port, sock_addr *saddr) { char *p; int ret; p = index(addr, ':'); if (!p) ret = parse_ipv4(addr, port, &saddr->ip); else ret = parse_ipv6(addr, iface, port, saddr); return ret; } int parse_ip_port(const char *addr, sock_addr *saddr) { char *s, *p; unsigned port; char *endp = NULL; int ret; s = strdup(addr); if (!s) { fprintf(stderr, "Failed strdup()\n"); return -1; } p = rindex(s, ':'); if (!p) { fprintf(stderr, "This addr: %s does not contain a port number\n", s); free(s); return -1; } port = strtoul(p+1, &endp, 10); if (!endp || *endp != 0) { fprintf(stderr, "Trailing garbage after the port in %s\n", s); free(s); return -1; } *p = 0; ret = parse_ip(s, NULL, port, saddr); free(s); return ret; } int send_tickle_ack(const sock_addr *dst, const sock_addr *src, uint32_t seq, uint32_t ack, int rst) { int s; int ret; uint32_t one = 1; uint16_t tmpport; sock_addr *tmpdest; struct { struct iphdr ip; struct tcphdr tcp; } ip4pkt; 
struct { struct ip6_hdr ip6; struct tcphdr tcp; } ip6pkt; switch (src->ip.sin_family) { case AF_INET: memset(&ip4pkt, 0, sizeof(ip4pkt)); ip4pkt.ip.version = 4; ip4pkt.ip.ihl = sizeof(ip4pkt.ip)/4; ip4pkt.ip.tot_len = htons(sizeof(ip4pkt)); ip4pkt.ip.ttl = 255; ip4pkt.ip.protocol = IPPROTO_TCP; ip4pkt.ip.saddr = src->ip.sin_addr.s_addr; ip4pkt.ip.daddr = dst->ip.sin_addr.s_addr; ip4pkt.ip.check = 0; ip4pkt.tcp.source = src->ip.sin_port; ip4pkt.tcp.dest = dst->ip.sin_port; ip4pkt.tcp.seq = seq; ip4pkt.tcp.ack_seq = ack; ip4pkt.tcp.ack = 1; if (rst) ip4pkt.tcp.rst = 1; ip4pkt.tcp.doff = sizeof(ip4pkt.tcp)/4; ip4pkt.tcp.window = htons(1234); ip4pkt.tcp.check = tcp_checksum((uint16_t *)&ip4pkt.tcp, sizeof(ip4pkt.tcp), &ip4pkt.ip); - s = socket(AF_INET, SOCK_RAW, htons(IPPROTO_RAW)); + s = socket(AF_INET, SOCK_RAW, IPPROTO_RAW); if (s == -1) { fprintf(stderr, "Failed to open raw socket (%s)\n", strerror(errno)); return -1; } ret = setsockopt(s, SOL_IP, IP_HDRINCL, &one, sizeof(one)); if (ret != 0) { fprintf(stderr, "Failed to setup IP headers (%s)\n", strerror(errno)); close(s); return -1; } set_nonblocking(s); set_close_on_exec(s); ret = sendto(s, &ip4pkt, sizeof(ip4pkt), 0, (const struct sockaddr *)&dst->ip, sizeof(dst->ip)); close(s); if (ret != sizeof(ip4pkt)) { fprintf(stderr, "Failed sendto (%s)\n", strerror(errno)); return -1; } break; case AF_INET6: memset(&ip6pkt, 0, sizeof(ip6pkt)); ip6pkt.ip6.ip6_vfc = 0x60; ip6pkt.ip6.ip6_plen = htons(20); ip6pkt.ip6.ip6_nxt = IPPROTO_TCP; ip6pkt.ip6.ip6_hlim = 64; ip6pkt.ip6.ip6_src = src->ip6.sin6_addr; ip6pkt.ip6.ip6_dst = dst->ip6.sin6_addr; ip6pkt.tcp.source = src->ip6.sin6_port; ip6pkt.tcp.dest = dst->ip6.sin6_port; ip6pkt.tcp.seq = seq; ip6pkt.tcp.ack_seq = ack; ip6pkt.tcp.ack = 1; if (rst) ip6pkt.tcp.rst = 1; ip6pkt.tcp.doff = sizeof(ip6pkt.tcp)/4; ip6pkt.tcp.window = htons(1234); ip6pkt.tcp.check = tcp_checksum6((uint16_t *)&ip6pkt.tcp, sizeof(ip6pkt.tcp), &ip6pkt.ip6); s = socket(PF_INET6, SOCK_RAW, IPPROTO_RAW); 
if (s == -1) { fprintf(stderr, "Failed to open sending socket\n"); return -1; } tmpdest = discard_const(dst); tmpport = tmpdest->ip6.sin6_port; tmpdest->ip6.sin6_port = 0; ret = sendto(s, &ip6pkt, sizeof(ip6pkt), 0, (const struct sockaddr *)&dst->ip6, sizeof(dst->ip6)); tmpdest->ip6.sin6_port = tmpport; close(s); if (ret != sizeof(ip6pkt)) { fprintf(stderr, "Failed sendto (%s)\n", strerror(errno)); return -1; } break; default: fprintf(stderr, "Not an ipv4/v6 address\n"); return -1; } return 0; } static void usage(void) { printf("Usage: /usr/lib/heartbeat/tickle_tcp [ -n num ]\n"); printf("Please note that this program need to read the list of\n"); printf("{local_ip:port remote_ip:port} from stdin.\n"); exit(1); } #define OPTION_STRING "n:h" int main(int argc, char *argv[]) { int optchar, i, num = 1, cont = 1; sock_addr src, dst; char addrline[128], addr1[64], addr2[64]; while(cont) { optchar = getopt(argc, argv, OPTION_STRING); switch(optchar) { case 'n': num = atoi(optarg); break; case 'h': usage(); exit(EXIT_SUCCESS); break; case EOF: cont = 0; break; default: fprintf(stderr, "unknown option, please use '-h' for usage.\n"); exit(EXIT_FAILURE); break; }; } while(fgets(addrline, sizeof(addrline), stdin)) { sscanf(addrline, "%s %s", addr1, addr2); if (parse_ip_port(addr1, &src)) { fprintf(stderr, "Bad IP:port '%s'\n", addr1); return -1; } if (parse_ip_port(addr2, &dst)) { fprintf(stderr, "Bad IP:port '%s'\n", addr2); return -1; } for (i = 1; i <= num; i++) { if (send_tickle_ack(&dst, &src, 0, 0, 0)) { fprintf(stderr, "Error while sending tickle ack from '%s' to '%s'\n", addr1, addr2); return -1; } } } return 0; }