diff --git a/heartbeat/IPv6addr.c b/heartbeat/IPv6addr.c index 255ce2166..68447de2e 100644 --- a/heartbeat/IPv6addr.c +++ b/heartbeat/IPv6addr.c @@ -1,876 +1,876 @@ /* * This program manages IPv6 address with OCF Resource Agent standard. * * Author: Huang Zhen * Copyright (c) 2004 International Business Machines * * This library is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License * as published by the Free Software Foundation; either version 2 * of the License, or (at your option) any later version. * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */ /* * It can add an IPv6 address, or remove one. * * Usage: IPv6addr {start|stop|status|monitor|meta-data} * * The "start" arg adds an IPv6 address. * The "stop" arg removes one. * The "status" arg shows whether the IPv6 address exists * The "monitor" arg shows whether the IPv6 address can be pinged (ICMPv6 ECHO) * The "meta-data" arg shows the meta data(XML) */ /* * ipv6-address: * * currently the following forms are legal: * address * address/prefix * * E.g. * 3ffe:ffff:0:f101::3 * 3ffe:ffff:0:f101::3/64 * * It should be passed by environment variables: * OCF_RESKEY_ipv6addr=3ffe:ffff:0:f101::3 * OCF_RESKEY_cidr_netmask=64 * OCF_RESKEY_nic=eth0 * */ /* * start: * 1.IPv6addr will choose a proper interface for the new address. * 2.Then assign the new address to the interface. * 3.Wait until the new address is available (reply ICMPv6 ECHO packet) * 4.Send out the unsolicited advertisements. 
* * return 0(OCF_SUCCESS) for success * return 1(OCF_ERR_GENERIC) for failure * return 2(OCF_ERR_ARGS) for invalid or excess argument(s) * * * stop: * remove the address from the interface. * * return 0(OCF_SUCCESS) for success * return 1(OCF_ERR_GENERIC) for failure * return 2(OCF_ERR_ARGS) for invalid or excess argument(s) * * status: * return the status of the address. only check whether it exists. * * return 0(OCF_SUCCESS) for existing * return 7(OCF_NOT_RUNNING) for not existing * return 2(OCF_ERR_ARGS) for invalid or excess argument(s) * * * monitor: * ping the address by ICMPv6 ECHO request. * * return 0(OCF_SUCCESS) for response correctly. * return 7(OCF_NOT_RUNNING) for no response. * return 2(OCF_ERR_ARGS) for invalid or excess argument(s) */ #include #include #include #include #include #include #include #include #include #include /* for inet_pton */ #include /* for if_nametoindex */ #include #include #include #include #include #include #include #include #define PIDFILE_BASE HA_RSCTMPDIR "/IPv6addr-" /* 0 No error, action succeeded completely 1 generic or unspecified error (current practice) The "monitor" operation shall return this for a crashed, hung or otherwise non-functional resource. 2 invalid or excess argument(s) Likely error code for validate-all, if the instance parameters do not validate. Any other action is free to also return this exit status code for this case. 
3 unimplemented feature (for example, "reload") 4 user had insufficient privilege 5 program is not installed 6 program is not configured 7 program is not running 8 resource is running in "master" mode and fully operational 9 resource is in "master" mode but in a failed state */ #define OCF_SUCCESS 0 #define OCF_ERR_GENERIC 1 #define OCF_ERR_ARGS 2 #define OCF_ERR_UNIMPLEMENTED 3 #define OCF_ERR_PERM 4 #define OCF_ERR_INSTALLED 5 #define OCF_ERR_CONFIGURED 6 #define OCF_NOT_RUNNING 7 const char* APP_NAME = "IPv6addr"; const char* START_CMD = "start"; const char* STOP_CMD = "stop"; const char* STATUS_CMD = "status"; const char* MONITOR_CMD = "monitor"; const char* ADVT_CMD = "advt"; const char* RECOVER_CMD = "recover"; const char* RELOAD_CMD = "reload"; const char* META_DATA_CMD = "meta-data"; const char* VALIDATE_CMD = "validate-all"; const int QUERY_COUNT = 5; struct in6_ifreq { struct in6_addr ifr6_addr; uint32_t ifr6_prefixlen; unsigned int ifr6_ifindex; }; static int start_addr6(struct in6_addr* addr6, int prefix_len, char* prov_ifname); static int stop_addr6(struct in6_addr* addr6, int prefix_len, char* prov_ifname); static int status_addr6(struct in6_addr* addr6, int prefix_len, char* prov_ifname); static int monitor_addr6(struct in6_addr* addr6, int prefix_len); static int advt_addr6(struct in6_addr* addr6, int prefix_len, char* prov_ifname); static int meta_data_addr6(void); static void usage(const char* self); int write_pid_file(const char *pid_file); int create_pid_directory(const char *pid_file); static void byebye(int nsig); static char* scan_if(struct in6_addr* addr_target, int* plen_target, int use_mask, char* prov_ifname); static char* find_if(struct in6_addr* addr_target, int* plen_target, char* prov_ifname); static char* get_if(struct in6_addr* addr_target, int* plen_target, char* prov_ifname); static int assign_addr6(struct in6_addr* addr6, int prefix_len, char* if_name); static int unassign_addr6(struct in6_addr* addr6, int prefix_len, char* 
if_name); int is_addr6_available(struct in6_addr* addr6); int main(int argc, char* argv[]) { char pid_file[256]; char* ipv6addr; char* cidr_netmask; int ret; char* cp; char* prov_ifname = NULL; int prefix_len = -1; struct in6_addr addr6; /* Check the count of parameters first */ if (argc < 2) { usage(argv[0]); return OCF_ERR_ARGS; } /* set termination signal */ siginterrupt(SIGTERM, 1); signal(SIGTERM, byebye); /* open system log */ cl_log_set_entity(APP_NAME); cl_log_set_facility(LOG_DAEMON); /* the meta-data dont need any parameter */ if (0 == strncmp(META_DATA_CMD, argv[1], strlen(META_DATA_CMD))) { ret = meta_data_addr6(); return OCF_SUCCESS; } /* check the OCF_RESKEY_ipv6addr parameter, should be an IPv6 address */ ipv6addr = getenv("OCF_RESKEY_ipv6addr"); if (ipv6addr == NULL) { cl_log(LOG_ERR, "Please set OCF_RESKEY_ipv6addr to the IPv6 address you want to manage."); usage(argv[0]); return OCF_ERR_ARGS; } /* legacy option */ if ((cp = strchr(ipv6addr, '/'))) { prefix_len = atol(cp + 1); if ((prefix_len < 0) || (prefix_len > 128)) { cl_log(LOG_ERR, "Invalid prefix_len [%s], should be an integer in [0, 128]", cp+1); usage(argv[0]); return OCF_ERR_ARGS; } *cp=0; } /* get provided netmask (optional) */ cidr_netmask = getenv("OCF_RESKEY_cidr_netmask"); if (cidr_netmask != NULL) { if ((atol(cidr_netmask) < 0) || (atol(cidr_netmask) > 128)) { cl_log(LOG_ERR, "Invalid prefix_len [%s], " "should be an integer in [0, 128]", cidr_netmask); usage(argv[0]); return OCF_ERR_ARGS; } if (prefix_len != -1 && prefix_len != atol(cidr_netmask)) { cl_log(LOG_DEBUG, "prefix_len(%d) is overwritted by cidr_netmask(%s)", prefix_len, cidr_netmask); } prefix_len = atol(cidr_netmask); } else if (prefix_len == -1) { prefix_len = 0; } /* get provided interface name (optional) */ prov_ifname = getenv("OCF_RESKEY_nic"); if (inet_pton(AF_INET6, ipv6addr, &addr6) <= 0) { cl_log(LOG_ERR, "Invalid IPv6 address [%s]", ipv6addr); usage(argv[0]); return OCF_ERR_ARGS; } /* Check whether this system 
supports IPv6 */ if (access(IF_INET6, R_OK)) { cl_log(LOG_ERR, "No support for INET6 on this system."); return OCF_ERR_GENERIC; } /* create the pid file so we can make sure that only one IPv6addr * for this address is running */ if (snprintf(pid_file, sizeof(pid_file), "%s%s", PIDFILE_BASE, ipv6addr) >= (int)sizeof(pid_file)) { cl_log(LOG_ERR, "Pid file truncated"); return OCF_ERR_GENERIC; } if (write_pid_file(pid_file) < 0) { return OCF_ERR_GENERIC; } /* switch the command */ if (0 == strncmp(START_CMD,argv[1], strlen(START_CMD))) { ret = start_addr6(&addr6, prefix_len, prov_ifname); }else if (0 == strncmp(STOP_CMD,argv[1], strlen(STOP_CMD))) { ret = stop_addr6(&addr6, prefix_len, prov_ifname); }else if (0 == strncmp(STATUS_CMD,argv[1], strlen(STATUS_CMD))) { ret = status_addr6(&addr6, prefix_len, prov_ifname); }else if (0 ==strncmp(MONITOR_CMD,argv[1], strlen(MONITOR_CMD))) { ret = monitor_addr6(&addr6, prefix_len); }else if (0 ==strncmp(RELOAD_CMD,argv[1], strlen(RELOAD_CMD))) { ret = OCF_ERR_UNIMPLEMENTED; }else if (0 ==strncmp(RECOVER_CMD,argv[1], strlen(RECOVER_CMD))) { ret = OCF_ERR_UNIMPLEMENTED; }else if (0 ==strncmp(VALIDATE_CMD,argv[1], strlen(VALIDATE_CMD))) { /* ipv6addr has been validated by inet_pton, hence a valid IPv6 address */ ret = OCF_SUCCESS; }else if (0 ==strncmp(ADVT_CMD,argv[1], strlen(MONITOR_CMD))) { ret = advt_addr6(&addr6, prefix_len, prov_ifname); }else{ usage(argv[0]); ret = OCF_ERR_ARGS; } /* release the pid file */ unlink(pid_file); return ret; } int start_addr6(struct in6_addr* addr6, int prefix_len, char* prov_ifname) { int i; char* if_name; if(OCF_SUCCESS == status_addr6(addr6,prefix_len,prov_ifname)) { return OCF_SUCCESS; } /* we need to find a proper device to assign the address */ if_name = find_if(addr6, &prefix_len, prov_ifname); if (NULL == if_name) { cl_log(LOG_ERR, "no valid mechanisms"); return OCF_ERR_GENERIC; } /* Assign the address */ if (0 != assign_addr6(addr6, prefix_len, if_name)) { cl_log(LOG_ERR, "failed to 
assign the address to %s", if_name); return OCF_ERR_GENERIC; } /* Check whether the address available */ for (i = 0; i < QUERY_COUNT; i++) { if (0 == is_addr6_available(addr6)) { break; } sleep(1); } if (i == QUERY_COUNT) { cl_log(LOG_ERR, "failed to ping the address"); return OCF_ERR_GENERIC; } /* Send unsolicited advertisement packet to neighbor */ for (i = 0; i < UA_REPEAT_COUNT; i++) { send_ua(addr6, if_name); sleep(1); } return OCF_SUCCESS; } int advt_addr6(struct in6_addr* addr6, int prefix_len, char* prov_ifname) { /* First, we need to find a proper device to assign the address */ char* if_name = get_if(addr6, &prefix_len, prov_ifname); int i; if (NULL == if_name) { cl_log(LOG_ERR, "no valid mechanisms"); return OCF_ERR_GENERIC; } /* Send unsolicited advertisement packet to neighbor */ for (i = 0; i < UA_REPEAT_COUNT; i++) { send_ua(addr6, if_name); sleep(1); } return OCF_SUCCESS; } int stop_addr6(struct in6_addr* addr6, int prefix_len, char* prov_ifname) { char* if_name; if(OCF_NOT_RUNNING == status_addr6(addr6,prefix_len,prov_ifname)) { return OCF_SUCCESS; } if_name = get_if(addr6, &prefix_len, prov_ifname); if (NULL == if_name) { cl_log(LOG_ERR, "no valid mechanisms."); /* I think this should be a success exit according to LSB. 
*/ return OCF_ERR_GENERIC; } /* Unassign the address */ if (0 != unassign_addr6(addr6, prefix_len, if_name)) { cl_log(LOG_ERR, "failed to assign the address to %s", if_name); return OCF_ERR_GENERIC; } return OCF_SUCCESS; } int status_addr6(struct in6_addr* addr6, int prefix_len, char* prov_ifname) { char* if_name = get_if(addr6, &prefix_len, prov_ifname); if (NULL == if_name) { return OCF_NOT_RUNNING; } return OCF_SUCCESS; } int monitor_addr6(struct in6_addr* addr6, int prefix_len) { if(0 == is_addr6_available(addr6)) { return OCF_SUCCESS; } return OCF_NOT_RUNNING; } /* find the network interface associated with an address */ char* scan_if(struct in6_addr* addr_target, int* plen_target, int use_mask, char* prov_ifname) { FILE *f; static char devname[21]=""; struct in6_addr addr; struct in6_addr mask; unsigned int plen, scope, dad_status, if_idx; unsigned int addr6p[4]; /* open /proc/net/if_inet6 file */ if ((f = fopen(IF_INET6, "r")) == NULL) { return NULL; } /* Loop for each entry */ while (1) { int i; int n; int s; gboolean same = TRUE; i = fscanf(f, "%08x%08x%08x%08x %x %02x %02x %02x %20s\n", &addr6p[0], &addr6p[1], &addr6p[2], &addr6p[3], &if_idx, &plen, &scope, &dad_status, devname); if (i == EOF) { break; } else if (i != 9) { cl_log(LOG_INFO, "Error parsing %s, " "perhaps the format has changed\n", IF_INET6); break; } /* Consider link-local addresses (scope == 0x20) only when * the inerface name is provided, and global addresses * (scope == 0). Skip everything else. */ if (scope != 0) { if (scope != 0x20 || prov_ifname == 0 || *prov_ifname == 0) continue; } /* If specified prefix, only same prefix entry * would be considered. 
*/ if (*plen_target!=0 && plen != *plen_target) { continue; } /* If interface name provided, only same devname entry * would be considered */ if (prov_ifname!=0 && *prov_ifname!=0) { if (strcmp(devname, prov_ifname)) continue; } for (i = 0; i< 4; i++) { addr.s6_addr32[i] = htonl(addr6p[i]); } /* Make the mask based on prefix length */ memset(mask.s6_addr, 0xff, 16); if (use_mask && plen < 128) { n = plen / 32; memset(mask.s6_addr32 + n + 1, 0, (3 - n) * 4); s = 32 - plen % 32; if (s == 32) mask.s6_addr32[n] = 0x0; else mask.s6_addr32[n] = 0xffffffff << s; mask.s6_addr32[n] = htonl(mask.s6_addr32[n]); } /* compare addr and addr_target */ same = TRUE; for (i = 0; i < 4; i++) { if ((addr.s6_addr32[i]&mask.s6_addr32[i]) != (addr_target->s6_addr32[i]&mask.s6_addr32[i])) { same = FALSE; break; } } /* We found it! */ if (same) { fclose(f); *plen_target = plen; return devname; } } fclose(f); return NULL; } /* find a proper network interface to assign the address */ char* find_if(struct in6_addr* addr_target, int* plen_target, char* prov_ifname) { char *best_ifname = scan_if(addr_target, plen_target, 1, prov_ifname); /* use the provided ifname and prefix if the address did not match */ if (best_ifname == NULL && prov_ifname != 0 && *prov_ifname != 0 && *plen_target != 0) { cl_log(LOG_INFO, "Could not find a proper interface by the ipv6addr. 
Using the specified nic:'%s' and cidr_netmask:'%d'", prov_ifname, *plen_target); return prov_ifname; } return best_ifname; } /* get the device name and the plen_target of a special address */ char* get_if(struct in6_addr* addr_target, int* plen_target, char* prov_ifname) { return scan_if(addr_target, plen_target, 0, prov_ifname); } int assign_addr6(struct in6_addr* addr6, int prefix_len, char* if_name) { struct in6_ifreq ifr6; /* Get socket first */ int fd; struct ifreq ifr; fd = socket(AF_INET6, SOCK_DGRAM, 0); if (fd < 0) { return 1; } /* Query the index of the if */ strcpy(ifr.ifr_name, if_name); if (ioctl(fd, SIOGIFINDEX, &ifr) < 0) { return -1; } /* Assign the address to the if */ ifr6.ifr6_addr = *addr6; ifr6.ifr6_ifindex = ifr.ifr_ifindex; ifr6.ifr6_prefixlen = prefix_len; if (ioctl(fd, SIOCSIFADDR, &ifr6) < 0) { return -1; } close (fd); return 0; } int unassign_addr6(struct in6_addr* addr6, int prefix_len, char* if_name) { int fd; struct ifreq ifr; struct in6_ifreq ifr6; /* Get socket first */ fd = socket(AF_INET6, SOCK_DGRAM, 0); if (fd < 0) { return 1; } /* Query the index of the if */ strcpy(ifr.ifr_name, if_name); if (ioctl(fd, SIOGIFINDEX, &ifr) < 0) { return -1; } /* Unassign the address to the if */ ifr6.ifr6_addr = *addr6; ifr6.ifr6_ifindex = ifr.ifr_ifindex; ifr6.ifr6_prefixlen = prefix_len; if (ioctl(fd, SIOCDIFADDR, &ifr6) < 0) { return -1; } close (fd); return 0; } #define MINPACKSIZE 64 int is_addr6_available(struct in6_addr* addr6) { struct sockaddr_in6 addr; struct icmp6_hdr icmph; u_char outpack[MINPACKSIZE]; int icmp_sock; int ret; struct iovec iov; u_char packet[MINPACKSIZE]; struct msghdr msg; if ((icmp_sock = socket(AF_INET6, SOCK_RAW, IPPROTO_ICMPV6)) == -1) { return -1; } memset(&icmph, 0, sizeof(icmph)); icmph.icmp6_type = ICMP6_ECHO_REQUEST; icmph.icmp6_code = 0; icmph.icmp6_cksum = 0; icmph.icmp6_seq = htons(0); icmph.icmp6_id = 0; memset(&outpack, 0, sizeof(outpack)); memcpy(&outpack, &icmph, sizeof(icmph)); memset(&addr, 0, 
sizeof(struct sockaddr_in6)); addr.sin6_family = AF_INET6; addr.sin6_port = htons(IPPROTO_ICMPV6); memcpy(&addr.sin6_addr,addr6,sizeof(struct in6_addr)); /* Only the first 8 bytes of outpack are meaningful... */ ret = sendto(icmp_sock, (char *)outpack, sizeof(outpack), 0, (struct sockaddr *) &addr, sizeof(struct sockaddr_in6)); if (0 >= ret) { return -1; } iov.iov_base = (char *)packet; iov.iov_len = sizeof(packet); msg.msg_name = &addr; msg.msg_namelen = sizeof(addr); msg.msg_iov = &iov; msg.msg_iovlen = 1; msg.msg_control = NULL; msg.msg_controllen = 0; ret = recvmsg(icmp_sock, &msg, MSG_DONTWAIT); if (0 >= ret) { return -1; } return 0; } static void usage(const char* self) { printf("usage: %s {start|stop|status|monitor|validate-all|meta-data}\n",self); return; } /* Following code is copied from send_arp.c, linux-HA project. */ void byebye(int nsig) { (void)nsig; /* Avoid an "error exit" log message if we're killed */ exit(0); } int create_pid_directory(const char *pid_file) { int status; int return_status = -1; struct stat stat_buf; char* dir; dir = strdup(pid_file); if (!dir) { cl_log(LOG_INFO, "Memory allocation failure: %s", strerror(errno)); return -1; } dirname(dir); status = stat(dir, &stat_buf); if (status < 0 && errno != ENOENT && errno != ENOTDIR) { cl_log(LOG_INFO, "Could not stat pid-file directory " "[%s]: %s", dir, strerror(errno)); goto err; } if (!status) { if (S_ISDIR(stat_buf.st_mode)) { goto out; } cl_log(LOG_INFO, "Pid-File directory exists but is " "not a directory [%s]", dir); goto err; } if (mkdir(dir, S_IRUSR|S_IWUSR|S_IXUSR | S_IRGRP|S_IXGRP) < 0) { cl_log(LOG_INFO, "Could not create pid-file directory " "[%s]: %s", dir, strerror(errno)); goto err; } out: return_status = 0; err: free(dir); return return_status; } int write_pid_file(const char *pid_file) { int pidfilefd; char pidbuf[11]; unsigned long pid; ssize_t bytes; if (*pid_file != '/') { cl_log(LOG_INFO, "Invalid pid-file name, must begin with a " "'/' [%s]\n", pid_file); return -1; 
} if (create_pid_directory(pid_file) < 0) { return -1; } while (1) { pidfilefd = open(pid_file, O_CREAT|O_EXCL|O_RDWR, S_IRUSR|S_IWUSR); if (pidfilefd < 0) { if (errno != EEXIST) { /* Old PID file */ cl_log(LOG_INFO, "Could not open pid-file " "[%s]: %s", pid_file, strerror(errno)); return -1; } } else { break; } pidfilefd = open(pid_file, O_RDONLY, S_IRUSR|S_IWUSR); if (pidfilefd < 0) { cl_log(LOG_INFO, "Could not open pid-file " "[%s]: %s", pid_file, strerror(errno)); return -1; } while (1) { bytes = read(pidfilefd, pidbuf, sizeof(pidbuf)-1); if (bytes < 0) { if (errno == EINTR) { continue; } cl_log(LOG_INFO, "Could not read pid-file " "[%s]: %s", pid_file, strerror(errno)); return -1; } pidbuf[bytes] = '\0'; break; } if(unlink(pid_file) < 0) { cl_log(LOG_INFO, "Could not delete pid-file " "[%s]: %s", pid_file, strerror(errno)); return -1; } if (!bytes) { cl_log(LOG_INFO, "Invalid pid in pid-file " "[%s]: %s", pid_file, strerror(errno)); return -1; } close(pidfilefd); pid = strtoul(pidbuf, NULL, 10); if (pid == ULONG_MAX && errno == ERANGE) { cl_log(LOG_INFO, "Invalid pid in pid-file " "[%s]: %s", pid_file, strerror(errno)); return -1; } if (kill(pid, SIGKILL) < 0 && errno != ESRCH) { - cl_log(LOG_INFO, "Error killing old proccess [%lu] " + cl_log(LOG_INFO, "Error killing old process [%lu] " "from pid-file [%s]: %s", pid, pid_file, strerror(errno)); return -1; } cl_log(LOG_INFO, "Killed old send_ua process [%lu]", pid); } if (snprintf(pidbuf, sizeof(pidbuf), "%u" , getpid()) >= (int)sizeof(pidbuf)) { cl_log(LOG_INFO, "Pid too long for buffer [%u]", getpid()); return -1; } while (1) { bytes = write(pidfilefd, pidbuf, strlen(pidbuf)); if (bytes != strlen(pidbuf)) { if (bytes < 0 && errno == EINTR) { continue; } cl_log(LOG_INFO, "Could not write pid-file " "[%s]: %s", pid_file, strerror(errno)); return -1; } break; } close(pidfilefd); return 0; } static int meta_data_addr6(void) { const char* meta_data= "\n" "\n" "\n" " 1.0\n" " \n" " This script manages IPv6 alias 
IPv6 addresses,It can add an IP6\n" " alias, or remove one.\n" " \n" " Manages IPv6 aliases\n" " \n" " \n" " \n" " The IPv6 address this RA will manage \n" " \n" " IPv6 address\n" " \n" " \n" " \n" " \n" " The netmask for the interface in CIDR format. (ie, 24).\n" " The value of this parameter overwrites the value of _prefix_\n" " of ipv6addr parameter.\n" " \n" " Netmask\n" " \n" " \n" " \n" " \n" " The base network interface on which the IPv6 address will\n" " be brought online.\n" " \n" " Network interface\n" " \n" " \n" " \n" " \n" " \n" " \n" " \n" " \n" " \n" " \n" " \n" "\n"; printf("%s\n",meta_data); return OCF_SUCCESS; } diff --git a/heartbeat/Raid1 b/heartbeat/Raid1 index 1b9213ac3..0f960b2b4 100755 --- a/heartbeat/Raid1 +++ b/heartbeat/Raid1 @@ -1,570 +1,570 @@ #!/bin/sh # # # License: GNU General Public License (GPL) # Support: users@clusterlabs.org # # Raid1 # Description: Manages a Linux software RAID device on a shared storage medium. # Original Author: Eric Z. Ayers (eric.ayers@compgen.com) # Original Release: 25 Oct 2000 # RAID patches: http://people.redhat.com/mingo/raid-patches/ # Word to the Wise: http://lwn.net/2000/0810/a/raid-faq.php3 # Sympathetic Ear: mailto:linux-raid@vger.kernel.org # # usage: $0 {start|stop|status|monitor|validate-all|usage|meta-data} # # # EXAMPLE config file /etc/raidtab.md0 # This file must exist on both machines! # # raiddev /dev/md0 # raid-level 1 # nr-raid-disks 2 # chunk-size 64k # persistent-superblock 1 # #nr-spare-disks 0 # device /dev/sda1 # raid-disk 0 # device /dev/sdb1 # raid-disk 1 # # EXAMPLE config file /etc/mdadm.conf (for more info:man mdadm.conf) # # DEVICE /dev/sdb1 /dev/sdc1 # ARRAY /dev/md0 UUID=4a865b55:ba27ef8d:29cd5701:6fb42799 ####################################################################### # Initialization: : ${OCF_FUNCTIONS_DIR=${OCF_ROOT}/lib/heartbeat} . 
${OCF_FUNCTIONS_DIR}/ocf-shellfuncs ####################################################################### usage() { cat <<-EOT usage: $0 {start|stop|status|monitor|validate-all|usage|meta-data} EOT } meta_data() { cat < 1.0 This resource agent manages Linux software RAID (MD) devices on a shared storage medium. It uses mdadm(8) to start, stop, and monitor the MD devices. Raidtools are supported, but deprecated. See https://raid.wiki.kernel.org/index.php/Linux_Raid for more information. Manages Linux software RAID (MD) devices on shared storage The RAID configuration file, e.g. /etc/mdadm.conf. RAID config file One or more block devices to use, space separated. Alternatively, set to "auto" to manage all devices specified in raidconf. block device The value for the homehost directive; this is an mdadm feature to protect RAIDs against being activated by accident. It is recommended to create RAIDs managed by the cluster with "homehost" set to a special -value, so they are not accidentially auto-assembled by nodes not +value, so they are not accidentally auto-assembled by nodes not supposed to own them. Homehost for mdadm If processes or kernel threads are using the array, it cannot be stopped. We will try to stop processes, first by sending TERM and then, if that doesn't help in $PROC_CLEANUP_TIME seconds, using KILL. The lsof(8) program is required to get the list of array users. Of course, the kernel threads cannot be stopped this way. If the processes are critical for data integrity, then set this parameter to false. Note that in that case the stop operation will fail and the node will be fenced. force stop processes using the array Wait until udevd creates a device in the start operation. On a normally loaded host this should happen quickly, but you may be unlucky. If you are not using udev set this to "no". udev Activating the same md RAID array on multiple nodes at the same time will result in data corruption and thus is forbidden by default. 
A safe example could be an array that is only named identically across all nodes, but is in fact distinct. Only set this to "true" if you know what you are doing! force ability to run as a clone END } udev_settle() { if ocf_is_true $WAIT_FOR_UDEV; then udevadm settle $* fi } list_conf_arrays() { test -f $RAIDCONF || { ocf_exit_reason "$RAIDCONF gone missing!" exit $OCF_ERR_GENERIC } grep ^ARRAY $RAIDCONF | awk '{print $2}' } forall() { local func=$1 local checkall=$2 local mddev rc=0 for mddev in $RAIDDEVS; do $func $mddev rc=$(($rc | $?)) [ "$checkall" = all ] && continue [ $rc -ne 0 ] && return $rc done return $rc } are_arrays_stopped() { local rc mddev for mddev in $RAIDDEVS; do raid1_monitor_one $mddev rc=$? [ $rc -ne $OCF_NOT_RUNNING ] && break done test $rc -eq $OCF_NOT_RUNNING } md_assemble() { local mddev=$1 $MDADM --assemble $mddev --config=$RAIDCONF $MDADM_HOMEHOST udev_settle --exit-if-exists=$mddev } # # START: Start up the RAID device # raid1_start() { local rc raid1_monitor rc=$? if [ $rc -eq $OCF_SUCCESS ]; then # md already online, nothing to do. return $OCF_SUCCESS fi if [ $rc -ne $OCF_NOT_RUNNING ]; then # If the array is in a broken state, this agent doesn't # know how to repair that. ocf_exit_reason "$RAIDDEVS in a broken state; cannot start (rc=$rc)" return $OCF_ERR_GENERIC fi if [ $HAVE_RAIDTOOLS = "true" ]; then # Run raidstart to start up the RAID array $RAIDSTART --configfile $RAIDCONF $MDDEV else forall md_assemble all fi raid1_monitor if [ $? -eq $OCF_SUCCESS ]; then return $OCF_SUCCESS else ocf_exit_reason "Couldn't start RAID for $RAIDDEVS" return $OCF_ERR_GENERIC fi } # # STOP: stop the RAID device # mark_readonly() { local mddev=$1 local rc ocf_log info "Attempting to mark array $mddev readonly" $MDADM --readonly $mddev --config=$RAIDCONF rc=$? 
if [ $rc -ne 0 ]; then ocf_exit_reason "Failed to set $mddev readonly (rc=$rc)" fi return $rc } mknod_raid1_stop() { # first create a block device file, then try to stop the # array local rc n tmp_block_file n=`echo $1 | sed 's/[^0-9]*//'` if ! ocf_is_decimal "$n"; then ocf_log warn "could not get the minor device number from $1" return 1 fi tmp_block_file="$HA_RSCTMP/${OCF_RESOURCE_INSTANCE}-`basename $1`" rm -f $tmp_block_file ocf_log info "block device file $1 missing, creating one in order to stop the array" mknod $tmp_block_file b 9 $n $MDADM --stop $tmp_block_file --config=$RAIDCONF rc=$? rm -f $tmp_block_file return $rc } raid1_stop_one() { ocf_log info "Stopping array $1" if [ -b "$1" ]; then $MDADM --stop $1 --config=$RAIDCONF && return else # newer mdadm releases can stop arrays when given the # basename; try that first $MDADM --stop `basename $1` --config=$RAIDCONF && return # otherwise create a block device file mknod_raid1_stop $1 fi } get_users_pids() { local mddev=$1 local outp l ocf_log debug "running lsof to list $mddev users..." outp=`lsof $mddev | tail -n +2` echo "$outp" | awk '{print $2}' | sort -u echo "$outp" | while read l; do ocf_log warn "$l" done } stop_raid_users() { local pids pids=`forall get_users_pids all | sort -u` if [ -z "$pids" ]; then ocf_log warn "lsof reported no users holding arrays" return 2 else ocf_stop_processes TERM $PROC_CLEANUP_TIME $pids fi } stop_arrays() { if [ $HAVE_RAIDTOOLS = "true" ]; then $RAIDSTOP --configfile $RAIDCONF $MDDEV else forall raid1_stop_one all fi } showusers() { local disk for disk; do if have_binary lsof; then ocf_log info "running lsof to list $disk users..." ocf_run -warn lsof $disk fi if [ -d /sys/block/$disk/holders ]; then ocf_log info "ls -l /sys/block/$disk/holders" ocf_run -warn ls -l /sys/block/$disk/holders fi done } raid1_stop() { local rc # See if the MD device is already cleanly stopped: if are_arrays_stopped; then return $OCF_SUCCESS fi # Turn off raid if ! 
stop_arrays; then if ocf_is_true $FORCESTOP; then if have_binary lsof; then stop_raid_users case $? in 2) false;; *) stop_arrays;; esac else ocf_log warn "install lsof(8) to list users holding the disk" false fi else false fi fi rc=$? if [ $rc -ne 0 ]; then ocf_log warn "Couldn't stop RAID for $RAIDDEVS (rc=$rc)" showusers $RAIDDEVS if [ $HAVE_RAIDTOOLS != "true" ]; then forall mark_readonly all fi return $OCF_ERR_GENERIC fi if are_arrays_stopped; then return $OCF_SUCCESS fi ocf_exit_reason "RAID $RAIDDEVS still active after stop command!" return $OCF_ERR_GENERIC } # # monitor: a less noisy status # raid1_monitor_one() { local mddev=$1 local md= local rc local TRY_READD=0 local pbsize # check if the md device exists first # but not if we are in the stop operation # device existence is important only for the running arrays if [ "$__OCF_ACTION" != "stop" ]; then if [ -h "$mddev" ]; then md=$(ls $mddev -l | awk -F'/' '{print $NF}') elif [ -b "$mddev" ]; then md=$(echo $mddev | sed 's,/dev/,,') else ocf_log info "$mddev is not a block device" return $OCF_NOT_RUNNING fi fi if ! grep -e "^$md[ \t:]" /proc/mdstat >/dev/null ; then ocf_log info "$md not found in /proc/mdstat" return $OCF_NOT_RUNNING fi if [ $HAVE_RAIDTOOLS != "true" ]; then $MDADM --detail --test $mddev >/dev/null 2>&1 ; rc=$? case $rc in 0) ;; 1) ocf_log warn "$mddev has at least one failed device." TRY_READD=1 ;; 2) ocf_exit_reason "$mddev has failed." return $OCF_ERR_GENERIC ;; 4) if [ "$__OCF_ACTION" = "stop" ] ; then # There may be a transient invalid device after # we stop MD due to uevent processing, the # original device is stopped though. return $OCF_NOT_RUNNING else ocf_exit_reason "mdadm failed on $mddev." return $OCF_ERR_GENERIC fi ;; *) ocf_exit_reason "mdadm returned an unknown result ($rc)." 
return $OCF_ERR_GENERIC ;; esac fi if [ "$__OCF_ACTION" = "monitor" -a "$OCF_RESKEY_CRM_meta_interval" != 0 \ -a $TRY_READD -eq 1 -a $OCF_CHECK_LEVEL -gt 0 ]; then ocf_log info "Attempting recovery sequence to re-add devices on $mddev:" $MDADM $mddev --fail detached $MDADM $mddev --remove failed $MDADM $mddev --re-add missing # TODO: At this stage, there's nothing to actually do # here. Either this worked or it did not. fi pbsize=`(blockdev --getpbsz $mddev || stat -c "%o" $mddev) 2>/dev/null` if [ -z "$pbsize" ]; then ocf_log warn "both blockdev and stat could not get the block size (will use 4k)" pbsize=4096 # try with 4k fi if ! dd if=$mddev count=1 bs=$pbsize of=/dev/null \ iflag=direct >/dev/null 2>&1 ; then ocf_exit_reason "$mddev: I/O error on read" return $OCF_ERR_GENERIC fi return $OCF_SUCCESS } raid1_monitor() { forall raid1_monitor_one } # # STATUS: is the raid device online or offline? # raid1_status() { # See if the MD device is online local rc raid1_monitor rc=$? if [ $rc -ne $OCF_SUCCESS ]; then echo "stopped" else echo "running" fi return $rc } raid1_validate_all() { return $OCF_SUCCESS } PROC_CLEANUP_TIME=3 if ( [ $# -ne 1 ] ) then usage exit $OCF_ERR_ARGS fi case "$1" in meta-data) meta_data exit $OCF_SUCCESS ;; usage) usage exit $OCF_SUCCESS ;; *) ;; esac RAIDCONF="$OCF_RESKEY_raidconf" MDDEV="$OCF_RESKEY_raiddev" FORCESTOP="${OCF_RESKEY_force_stop:-1}" WAIT_FOR_UDEV="${OCF_RESKEY_udev:-1}" if [ -z "$RAIDCONF" ] ; then ocf_exit_reason "Please set OCF_RESKEY_raidconf!" exit $OCF_ERR_CONFIGURED fi if [ ! -r "$RAIDCONF" ] ; then ocf_exit_reason "Configuration file [$RAIDCONF] does not exist, or can not be opened!" exit $OCF_ERR_INSTALLED fi if [ -z "$MDDEV" ] ; then ocf_exit_reason "Please set OCF_RESKEY_raiddev to the Raid device you want to control!" exit $OCF_ERR_CONFIGURED fi if ocf_is_clone && ! ocf_is_true "$OCF_RESKEY_force_clones"; then ocf_exit_reason "md RAID arrays are NOT safe to run as a clone!" 
ocf_log err "Please read the comment on the force_clones parameter." exit $OCF_ERR_CONFIGURED fi if ocf_is_true $WAIT_FOR_UDEV && ! have_binary udevadm; then if [ "$__OCF_ACTION" = "start" ]; then ocf_log warn "either install udevadm or set udev to false" ocf_log info "setting udev to false!" fi WAIT_FOR_UDEV=0 fi if ! ocf_is_true $WAIT_FOR_UDEV; then export MDADM_NO_UDEV=1 fi if ocf_is_true $FORCESTOP && ! have_binary lsof; then ocf_log warn "Please install lsof(8), we may need it when stopping Raid device! Now continuing anyway ..." fi HAVE_RAIDTOOLS=false if have_binary $MDADM >/dev/null 2>&1 ; then if [ -n "$OCF_RESKEY_homehost" ]; then MDADM_HOMEHOST="--homehost=${OCF_RESKEY_homehost}" else MDADM_HOMEHOST="" fi else check_binary $RAIDSTART HAVE_RAIDTOOLS=true fi if [ $HAVE_RAIDTOOLS = true ]; then if [ "$MDDEV" = "auto" ]; then ocf_exit_reason "autoconf supported only with mdadm!" exit $OCF_ERR_INSTALLED elif [ `echo $MDDEV|wc -w` -gt 1 ]; then ocf_exit_reason "multiple devices supported only with mdadm!" exit $OCF_ERR_INSTALLED fi fi if [ "$MDDEV" = "auto" ]; then RAIDDEVS=`list_conf_arrays` else RAIDDEVS="$MDDEV" fi # At this stage, # [ $HAVE_RAIDTOOLS = false ] <=> we have $MDADM, # otherwise we have raidtools (raidstart and raidstop) # Look for how we are called case "$1" in start) raid1_start ;; stop) raid1_stop ;; status) raid1_status ;; monitor) raid1_monitor ;; validate-all) raid1_validate_all ;; *) usage exit $OCF_ERR_UNIMPLEMENTED ;; esac exit $? diff --git a/heartbeat/Route b/heartbeat/Route index 42010f8db..67bdf6bfc 100755 --- a/heartbeat/Route +++ b/heartbeat/Route @@ -1,336 +1,336 @@ #!/bin/sh # # Route OCF RA. Enables and disables network routes. # # (c) 2008-2010 Florian Haas, Dejan Muhamedagic, # and Linux-HA contributors # # This program is free software; you can redistribute it and/or modify # it under the terms of version 2 of the GNU General Public License as # published by the Free Software Foundation. 
# # This program is distributed in the hope that it would be useful, but # WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. # # Further, this software is distributed without any warranty that it is # free of the rightful claim of any third person regarding infringement # or the like. Any license provided herein, whether implied or # otherwise, applies only to this software file. Patent licenses, if # any, provided herein do not apply to combinations of this program with # other software, or any other product whatsoever. # # You should have received a copy of the GNU General Public License # along with this program; if not, write the Free Software Foundation, # Inc., 59 Temple Place - Suite 330, Boston MA 02111-1307, USA. # ####################################################################### # Initialization: : ${OCF_FUNCTIONS_DIR=${OCF_ROOT}/lib/heartbeat} . ${OCF_FUNCTIONS_DIR}/ocf-shellfuncs # Default values OCF_RESKEY_family_default="detect" : ${OCF_RESKEY_family=${OCF_RESKEY_family_default}} ####################################################################### meta_data() { cat < 1.0 Enables and disables network routes. Supports host and net routes, routes via a gateway address, and routes using specific source addresses. This resource agent is useful if a node's routing table needs to be manipulated based on node role assignment. Consider the following example use case: - One cluster node serves as an IPsec tunnel endpoint. - All other nodes use the IPsec tunnel to reach hosts in a specific remote network. Then, here is how you would implement this scheme making use of the Route resource agent: - Configure an ipsec LSB resource. - Configure a cloned Route OCF resource. - Create an order constraint to ensure that ipsec is started before Route. 
- Create a colocation constraint between the ipsec and Route resources, to make sure no instance of your cloned Route resource is started on the tunnel endpoint itself. Manages network routes The destination network (or host) to be configured for the route. Specify the netmask suffix in CIDR notation (e.g. "/24"). If no suffix is given, a host route will be created. Specify "0.0.0.0/0" or "default" if you want this resource to set the system default route. Destination network The outgoing network device to use for this route. Outgoing network device The gateway IP address to use for this route. Gateway IP address The source IP address to be configured for the route. Source IP address The routing table to be configured for the route. Routing table The address family to be used for the route ip4 IP version 4 ip6 IP version 6 detect Detect from 'destination' address. Address Family END } ####################################################################### create_route_spec() { # Creates a route specification for use by "ip route (add|del|show)" route_spec="to ${OCF_RESKEY_destination}" if [ -n "${OCF_RESKEY_device}" ]; then route_spec="${route_spec} dev ${OCF_RESKEY_device}" fi if [ -n "${OCF_RESKEY_gateway}" ]; then route_spec="${route_spec} via ${OCF_RESKEY_gateway}" fi if [ -n "${OCF_RESKEY_source}" ]; then route_spec="${route_spec} src ${OCF_RESKEY_source}" fi if [ -n "${OCF_RESKEY_table}" ]; then route_spec="${route_spec} table ${OCF_RESKEY_table}" fi echo "$route_spec" } route_usage() { cat </dev/null 2>&1; then ocf_exit_reason "Network device ${OCF_RESKEY_device} appears not to be available on this system." # OCF_ERR_ARGS prevents the resource from running anywhere at all, # maybe another node has the interface? # OCF_ERR_INSTALLED just prevents starting on this particular node. return $OCF_ERR_INSTALLED fi fi # The following tests must return $OCF_ERR_INSTALLED, but only if # the resource is actually running (i.e., not during probes) if ! 
ocf_is_probe; then # If a source address has been configured, is it available on # this system? if [ -n "${OCF_RESKEY_source}" ]; then if ! ip address show | grep -w ${OCF_RESKEY_source} >/dev/null 2>&1; then ocf_exit_reason "Source address ${OCF_RESKEY_source} appears not to be available on this system." # same reason as with _device: return $OCF_ERR_INSTALLED fi fi # If a gateway address has been configured, is it reachable? if [ -n "${OCF_RESKEY_gateway}" ]; then if ! ip route get ${OCF_RESKEY_gateway} >/dev/null 2>&1; then ocf_exit_reason "Gateway address ${OCF_RESKEY_gateway} is unreachable." # same reason as with _device: return $OCF_ERR_INSTALLED fi fi fi return $OCF_SUCCESS } # These two actions must always succeed case $__OCF_ACTION in meta-data) meta_data # OCF variables are not set when querying meta-data exit 0 ;; usage|help) route_usage exit $OCF_SUCCESS ;; esac # Don't do anything if the necessary utilities aren't present for binary in ip grep; do check_binary $binary done route_validate || exit $? case $OCF_RESKEY_family in ip4) addr_family="-4" ;; ip6) addr_family="-6" ;; detect) case $OCF_RESKEY_destination in *:*) addr_family="-6" ;; *.*) addr_family="-4" ;; *) ocf_exit_reason "Address family detection requires a numeric destination address." ;; esac ;; *) ocf_exit_reason "Address family '${OCF_RESKEY_family}' not recognized." ;; esac case $__OCF_ACTION in start) route_start;; stop) route_stop;; status|monitor) route_status;; reload) ocf_log info "Reloading..." route_start ;; validate-all) ;; *) route_usage exit $OCF_ERR_UNIMPLEMENTED ;; esac rc=$? ocf_log debug "${OCF_RESOURCE_INSTANCE} $__OCF_ACTION returned $rc" exit $rc diff --git a/heartbeat/SAPInstance b/heartbeat/SAPInstance index d125cfe93..8cfaf22e7 100755 --- a/heartbeat/SAPInstance +++ b/heartbeat/SAPInstance @@ -1,968 +1,968 @@ #!/bin/sh # # SAPInstance # # Description: Manages a single SAP Instance as a High-Availability # resource. 
One SAP Instance is defined by one # SAP Instance-Profile. start/stop handles all services # of the START-Profile, status and monitor care only # about essential services. # # Author: Alexander Krauth, June 2006 # Support: linux@sap.com # License: GNU General Public License (GPL) # Copyright: (c) 2006-2008 Alexander Krauth # # An example usage: # See usage() function below for more details... # # OCF instance parameters: # OCF_RESKEY_InstanceName # OCF_RESKEY_DIR_EXECUTABLE (optional, well known directories will be searched by default) # OCF_RESKEY_DIR_PROFILE (optional, well known directories will be searched by default) # OCF_RESKEY_START_PROFILE (optional, well known directories will be searched by default) # OCF_RESKEY_START_WAITTIME (optional, to solve timing problems during J2EE-Addin start) # OCF_RESKEY_AUTOMATIC_RECOVER (optional, automatic startup recovery using cleanipc, default is false) # OCF_RESKEY_MONITOR_SERVICES (optional, default is to monitor critical services only) # OCF_RESKEY_SHUTDOWN_METHOD (optional, defaults to NORMAL, KILL: terminate the SAP instance with OS commands - faster, at your own risk) # OCF_RESKEY_ERS_InstanceName (optional, InstanceName of the ERS instance in a Master/Slave configuration) # OCF_RESKEY_ERS_START_PROFILE (optional, START_PROFILE of the ERS instance in a Master/Slave configuration) # OCF_RESKEY_PRE_START_USEREXIT (optional, lists a script which can be executed before the resource is started) # OCF_RESKEY_POST_START_USEREXIT (optional, lists a script which can be executed after the resource is started) # OCF_RESKEY_PRE_STOP_USEREXIT (optional, lists a script which can be executed before the resource is stopped) # OCF_RESKEY_POST_STOP_USEREXIT (optional, lists a script which can be executed after the resource is stopped) # OCF_RESKEY_IS_ERS (needed for ENQ/REPL NW 740) # # TODO: - Option to shutdown sapstartsrv for non-active instances -> that means: do probes only with OS tools (sapinstance_status) # - Option for 
better standalone enqueue server monitoring, using ensmon (test enque-deque) # - Option for cleanup abandoned enqueue replication tables # ####################################################################### # Initialization: : ${OCF_FUNCTIONS_DIR=${OCF_ROOT}/lib/heartbeat} . ${OCF_FUNCTIONS_DIR}/ocf-shellfuncs ####################################################################### SH=/bin/sh sapinstance_usage() { methods=`sapinstance_methods` methods=`echo $methods | tr ' ' '|'` cat <<-EOF usage: $0 ($methods) $0 manages a SAP Instance as an HA resource. The 'start' operation starts the instance or the ERS instance in a Master/Slave configuration The 'stop' operation stops the instance The 'status' operation reports whether the instance is running The 'monitor' operation reports whether the instance seems to be working The 'promote' operation starts the primary instance in a Master/Slave configuration The 'demote' operation stops the primary instance and starts the ERS instance The 'notify' operation always returns SUCCESS The 'validate-all' operation reports whether the parameters are valid The 'methods' operation reports on the methods $0 supports EOF } sapinstance_meta_data() { cat < 2.14 Usually a SAP system consists of one database and at least one or more SAP instances (sometimes called application servers). One SAP Instance is defined by having exactly one instance profile. The instance profiles can usually be found in the directory /sapmnt/SID/profile. Each instance must be configured as it's own resource in the cluster configuration. The resource agent supports the following SAP versions: - SAP WebAS ABAP Release 6.20 - 7.40 - SAP WebAS Java Release 6.40 - 7.40 - SAP WebAS ABAP + Java Add-In Release 6.20 - 7.40 (Java is not monitored by the cluster in that case) When using a SAP Kernel 6.40 please check and implement the actions from the section "Manual postprocessing" from SAP note 995116 (http://sdn.sap.com). 
Other versions may also work with this agent, but have not been verified. All operations of the SAPInstance resource agent are done by using the startup framework called SAP Management Console or sapstartsrv that was introduced with SAP kernel release 6.40. Find more information about the SAP Management Console in SAP note 1014480. Using this framework defines a clear interface for the Heartbeat cluster, how it sees the SAP system. The options for monitoring the SAP system are also much better than other methods like just watching the ps command for running processes or doing some pings to the application. sapstartsrv uses SOAP messages to request the status of running SAP processes. Therefore it can actually ask a process itself what it's status is, independent from other problems that might exist at the same time. sapstartsrv knows 4 status colours: - GREEN = everything is fine - YELLOW = something is wrong, but the service is still working - RED = the service does not work - GRAY = the service has not been started The SAPInstance resource agent will interpret GREEN and YELLOW as OK. That means that minor problems will not be reported to the Heartbeat cluster. This prevents the cluster from doing an unwanted failover. The statuses RED and GRAY are reported as NOT_RUNNING to the cluster. Depending on the status the cluster expects from the resource, it will do a restart, failover or just nothing. Manages a SAP instance as an HA resource. The full qualified SAP instance name. e.g. P01_DVEBMGS00_sapp01ci. Usually this is the name of the SAP instance profile. Instance name: SID_INSTANCE_VIR-HOSTNAME The full qualified path where to find sapstartsrv and sapcontrol. Specify this parameter, if you have changed the SAP kernel directory location after the default SAP installation. Path of sapstartsrv and sapcontrol The full qualified path where to find the SAP START profile. 
Specify this parameter, if you have changed the SAP profile directory location after the default SAP installation. Path of start profile The name of the SAP START profile. Specify this parameter, if you have changed the name of the SAP START profile after the default SAP installation. As SAP release 7.10 does not have a START profile anymore, you need to specify the Instance Profile than. Start profile name After that time in seconds a monitor operation is executed by the resource agent. Does the monitor return SUCCESS, the start ishandled as SUCCESS. This is useful to resolve timing problems with e.g. the J2EE-Addin instance.Usually the resource agent waits until all services are started and the SAP Management Console reports a GREEN status. A double stack installation (ABAP + Java AddIn) consists of an ABAP dispatcher and a JAVA instance. Normally the start of the JAVA instance takes much longer than the start of the ABAP instance. For a JAVA Instance you may need to configure a much higher timeout for the start operation of the resource in Heartbeat. The disadvantage here is, that the discovery of a failed start by the cluster takes longer. Somebody might say: For me it is important, that the ABAP instance is up and running. A failure of the JAVA instance shall not cause a failover of the SAP instance. Actually the SAP MC reports a YELLOW status, if the JAVA instance of a double stack system fails. From the resource agent point of view YELLOW means:everything is OK. Setting START_WAITTIME to a lower value determines the resource agent to check the status of the instance during a start operation after that time. As it would wait normally for a GREEN status, now it reports SUCCESS to the cluster in case of a YELLOW status already after the specified time. That is only useful for double stack systems. 
Check the successful start after that time (do not wait for J2EE-Addin) - The SAPInstance resource agent tries to recover a failed start attempt automaticaly one time. This is done by killing runing instance processes, removing the kill.sap file and executing cleanipc. Sometimes a crashed SAP instance leaves some processes and/or shared memory segments behind. Setting this option to true will try to remove those leftovers during a start operation. That is to reduce manual work for the administrator. + The SAPInstance resource agent tries to recover a failed start attempt automatically one time. This is done by killing running instance processes, removing the kill.sap file and executing cleanipc. Sometimes a crashed SAP instance leaves some processes and/or shared memory segments behind. Setting this option to true will try to remove those leftovers during a start operation. That is to reduce manual work for the administrator. Enable or disable automatic startup recovery Within a SAP instance there can be several services. Usually you will find the defined services in the START profile of the related instance (Attention: with SAP Release 7.10 the START profile content was moved to the instance profile). Not all of those services are worth to monitor by the cluster. For example you properly do not like to failover your SAP instance, if the central syslog collector daemon fails. Those services are monitored within the SAPInstance resource agent: - disp+work - msg_server - enserver - enrepserver - jcontrol - jstart Some other services could be monitored as well. They have to be given with the parameter MONITOR_SERVICES, e.g.: - sapwebdisp - TREXDaemon.x That names match the strings used in the output of the command 'sapcontrol -nr [Instance-Nr] -function GetProcessList'. The default should fit most cases where you want to manage a SAP Instance from the cluster. 
You may change this with this parameter, if you like to monitor more/less or other services that sapstartsrv supports. You may specify multiple services separated by a | (pipe) sign in this parameter: disp+work|msg_server|enserver Services to monitor Usual a SAP Instance is stopped by the command 'sapcontrol -nr InstanceNr -function Stop'. SHUTDOWN_METHOD=KILL means to kill the SAP Instance using OS commands. SAP processes of the instance are terminated with 'kill -9', shared memory is deleted with 'cleanipc' and the 'kill.sap' file will be deleted. That method is much faster than the gracefull stop, but the instance does not have the chance to say goodbye to other SAPinstances in the same system. USE AT YOUR OWN RISK !! Shutdown graceful or kill a SAP instance by terminating the processes. (normal|KILL) Only used in a Master/Slave resource configuration: The full qualified SAP enqueue replication instance name. e.g. P01_ERS02_sapp01ers. Usually this is the name of the SAP instance profile. The enqueue replication instance must be installed, before you want to configure a master-slave cluster recource. The master-slave configuration in the cluster must use this properties: clone_max = 2 clone_node_max = 1 master_node_max = 1 master_max = 1 Enqueue replication instance name: SID_INSTANCE_VIR-HOSTNAME Only used in a Master/Slave resource configuration: The parameter ERS_InstanceName must also be set in this configuration. The name of the SAP START profile. Specify this parameter, if you have changed the name of the SAP START profile after the default SAP installation. As SAP release 7.10 does not have a START profile anymore, you need to specify the Instance Profile than. Enqueue replication start profile name The full qualified path where to find a script or program which should be executed before this resource gets started. Path to a pre-start script The full qualified path where to find a script or program which should be executed after this resource got started. 
Path to a post-start script The full qualified path where to find a script or program which should be executed before this resource gets stopped. Path to a pre-start script The full qualified path where to find a script or program which should be executed after this resource got stopped. Path to a post-start script Only used for ASCS/ERS SAP Netweaver installations without implementing a master/slave resource to - allow the ASCS to 'find' the ERS running on an other cluster node after a resource failure. This parameter should be set + allow the ASCS to 'find' the ERS running on another cluster node after a resource failure. This parameter should be set to true 'only' for the ERS instance for implementations following the SAP NetWeaver 7.40 HA certification (NW-HA-CLU-740). This includes also systems for NetWeaver less than 7.40, if you like to impelemnt the NW-HA-CLU-740 scenario. Mark SAPInstance as ERS instance END } # # methods: What methods/operations do we support? # sapinstance_methods() { cat <<-EOF start stop status monitor promote demote notify validate-all methods meta-data usage EOF } # # is_clone : find out if we are configured to run in a Master/Slave configuration # is_clone() { if [ -n "$OCF_RESKEY_CRM_meta_clone_max" ] \ && [ "$OCF_RESKEY_CRM_meta_clone_max" -gt 0 ] then if [ "$OCF_RESKEY_CRM_meta_clone_max" -ne 2 ] || \ [ "$OCF_RESKEY_CRM_meta_clone_node_max" -ne 1 ] || \ [ "$OCF_RESKEY_CRM_meta_master_node_max" -ne 1 ] || \ [ "$OCF_RESKEY_CRM_meta_master_max" -ne 1 ] then ocf_log err "Clone options misconfigured. (expect: clone_max=2,clone_node_max=1,master_node_max=1,master_max=1)" exit $OCF_ERR_CONFIGURED fi if [ -z "$OCF_RESKEY_ERS_InstanceName" ] then ocf_log err "In a Master/Slave configuration the ERS_InstanceName parameter is mandatory." 
exit $OCF_ERR_ARGS fi else return 0 fi return 1 } # # abnormal_end : essential things are missing, but in the natur of a SAP installation - which can be very different # from customer to customer - we cannot handle this always as an error # This would be the case, if the software is installed on shared disks and not visible # to all cluster nodes at all times. # abnormal_end() { local err_msg=$1 ocf_is_probe && { sapinstance_status exit $? } if [ "$ACTION" = "stop" ] then cleanup_instance exit $OCF_SUCCESS fi ocf_log err $err_msg exit $OCF_ERR_CONFIGURED } # # sapinstance_init : Define global variables with default values, if optional parameters are not set # # sapinstance_init() { local myInstanceName="$1" SID=`echo "$myInstanceName" | cut -d_ -f1` InstanceName=`echo "$myInstanceName" | cut -d_ -f2` InstanceNr=`echo "$InstanceName" | sed 's/.*\([0-9][0-9]\)$/\1/'` SAPVIRHOST=`echo "$myInstanceName" | cut -d_ -f3` # optional OCF parameters, we try to guess which directories are correct if [ -z "$OCF_RESKEY_DIR_EXECUTABLE" ] then if have_binary /usr/sap/$SID/$InstanceName/exe/sapstartsrv && have_binary /usr/sap/$SID/$InstanceName/exe/sapcontrol then DIR_EXECUTABLE="/usr/sap/$SID/$InstanceName/exe" SAPSTARTSRV="/usr/sap/$SID/$InstanceName/exe/sapstartsrv" SAPCONTROL="/usr/sap/$SID/$InstanceName/exe/sapcontrol" elif have_binary /usr/sap/$SID/SYS/exe/run/sapstartsrv && have_binary /usr/sap/$SID/SYS/exe/run/sapcontrol then DIR_EXECUTABLE="/usr/sap/$SID/SYS/exe/run" SAPSTARTSRV="/usr/sap/$SID/SYS/exe/run/sapstartsrv" SAPCONTROL="/usr/sap/$SID/SYS/exe/run/sapcontrol" fi else if have_binary "$OCF_RESKEY_DIR_EXECUTABLE/sapstartsrv" && have_binary "$OCF_RESKEY_DIR_EXECUTABLE/sapcontrol" then DIR_EXECUTABLE="$OCF_RESKEY_DIR_EXECUTABLE" SAPSTARTSRV="$OCF_RESKEY_DIR_EXECUTABLE/sapstartsrv" SAPCONTROL="$OCF_RESKEY_DIR_EXECUTABLE/sapcontrol" fi fi sidadm="`echo $SID | tr '[:upper:]' '[:lower:]'`adm" [ -z "$DIR_EXECUTABLE" ] && abnormal_end "Cannot find sapstartsrv and sapcontrol 
executable, please set DIR_EXECUTABLE parameter!" if [ -z "$OCF_RESKEY_DIR_PROFILE" ] then DIR_PROFILE="/usr/sap/$SID/SYS/profile" else DIR_PROFILE="$OCF_RESKEY_DIR_PROFILE" fi if [ "$myInstanceName" != "$OCF_RESKEY_InstanceName" ] then currentSTART_PROFILE=$OCF_RESKEY_ERS_START_PROFILE else currentSTART_PROFILE=$OCF_RESKEY_START_PROFILE fi if [ -z "$OCF_RESKEY_IS_ERS" ]; then is_ers="no" else is_ers="$OCF_RESKEY_IS_ERS" fi if [ -z "$currentSTART_PROFILE" ] then SAPSTARTPROFILE="$DIR_PROFILE/START_${InstanceName}_${SAPVIRHOST}" else SAPSTARTPROFILE="$currentSTART_PROFILE" fi if [ -z "$OCF_RESKEY_START_WAITTIME" ] then export OCF_RESKEY_START_WAITTIME=3600 fi if [ -z "$OCF_RESKEY_MONITOR_SERVICES" ] then export OCF_RESKEY_MONITOR_SERVICES="disp+work|msg_server|enserver|enrepserver|jcontrol|jstart" fi # as root user we need the library path to the SAP kernel to be able to call sapcontrol if [ `echo $LD_LIBRARY_PATH | grep -c "^$DIR_EXECUTABLE\>"` -eq 0 ]; then LD_LIBRARY_PATH=$DIR_EXECUTABLE${LD_LIBRARY_PATH:+:}$LD_LIBRARY_PATH export LD_LIBRARY_PATH fi return $OCF_SUCCESS } # # check_sapstartsrv : Before using sapcontrol we make sure that the sapstartsrv is running for the correct instance. # We cannot use sapinit and the /usr/sap/sapservices file in case of an enquerep instance, # because then we have two instances with the same instance number. # check_sapstartsrv() { local restart=0 local runninginst="" local chkrc=$OCF_SUCCESS local output="" if [ ! -S /tmp/.sapstream5${InstanceNr}13 ]; then ocf_log warn "sapstartsrv is not running for instance $SID-$InstanceName (no UDS), it will be started now" restart=1 else output=`$SAPCONTROL -nr $InstanceNr -function ParameterValue INSTANCE_NAME -format script` if [ $? 
-eq 0 ] then runninginst=`echo "$output" | grep '^0 : ' | cut -d' ' -f3` if [ "$runninginst" != "$InstanceName" ] then ocf_log warn "sapstartsrv is running for instance $runninginst, that service will be killed" restart=1 else output=`$SAPCONTROL -nr $InstanceNr -function AccessCheck Start` if [ $? -ne 0 ]; then ocf_log warn "FAILED : sapcontrol -nr $InstanceNr -function AccessCheck Start (`ls -ld1 /tmp/.sapstream5${InstanceNr}13`)" ocf_log warn "sapstartsrv will be restarted to try to solve this situation, otherwise please check sapstsartsrv setup (SAP Note 927637)" restart=1 fi fi else ocf_log warn "sapstartsrv is not running for instance $SID-$InstanceName, it will be started now" restart=1 fi fi if [ -z "$runninginst" ]; then runninginst=$InstanceName; fi if [ $restart -eq 1 ] then if [ -d /usr/sap/$SID/SYS/profile/ ] then DIR_PROFILE="/usr/sap/$SID/SYS/profile" else abnormal_end "Expected /usr/sap/$SID/SYS/profile/ to be a directory, please set DIR_PROFILE parameter!" fi [ ! -r $SAPSTARTPROFILE ] && abnormal_end "Expected $SAPSTARTPROFILE to be the instance START profile, please set START_PROFILE parameter!" pkill -9 -f "sapstartsrv.*$runninginst" # removing the unix domain socket files as they might have wrong permissions # or ownership - they will be recreated by sapstartsrv during next start rm -f /tmp/.sapstream5${InstanceNr}13 rm -f /tmp/.sapstream5${InstanceNr}14 $SAPSTARTSRV pf=$SAPSTARTPROFILE -D -u $sidadm # now make sure the daemon has been started and is able to respond local srvrc=1 while [ $srvrc -eq 1 -a `pgrep -f "sapstartsrv.*$runninginst" | wc -l` -gt 0 ] do sleep 1 $SAPCONTROL -nr $InstanceNr -function GetProcessList > /dev/null 2>&1 srvrc=$? done if [ $srvrc -ne 1 ] then ocf_log info "sapstartsrv for instance $SID-$InstanceName was restarted !" chkrc=$OCF_SUCCESS else ocf_log error "sapstartsrv for instance $SID-$InstanceName could not be started!" 
chkrc=$OCF_ERR_GENERIC ocf_is_probe && chkrc=$OCF_NOT_RUNNING fi fi return $chkrc } # # sapuserexit : Many SAP customers need some additional processes/tools to run their SAP systems. # This specialties do not allow a totally generic SAP cluster resource agent. # Someone should write a resource agent for each additional process you need, if it # is required to monitor that process within the cluster manager. To enable # you to extent this resource agent without developing a new one, this user exit # was introduced. # sapuserexit() { local NAME="$1" local VALUE="$2" if [ -n "$VALUE" ] then if have_binary "$VALUE" then ocf_log info "Calling userexit ${NAME} with customer script file ${VALUE}" "$VALUE" >/dev/null 2>&1 ocf_log info "Exiting userexit ${NAME} with customer script file ${VALUE}, returncode: $?" else ocf_log warn "Attribute ${NAME} is set to ${VALUE}, but this file is not executable" fi fi return 0 } # # cleanup_instance : remove resources (processes and shared memory) from a crashed instance) # cleanup_instance() { pkill -9 -f -U $sidadm $InstanceName ocf_log info "Terminated instance using 'pkill -9 -f -U $sidadm $InstanceName'" # it is necessary to call cleanipc as user sidadm if the system has 'vmcj/enable = ON' set - otherwise SHM-segments in /dev/shm/SAP_ES2* cannot be removed su - $sidadm -c "cleanipc $InstanceNr remove" ocf_log info "Tried to remove shared memory resources using 'cleanipc $InstanceNr remove' as user $sidadm" ocf_run rm -fv /usr/sap/$SID/$InstanceName/work/kill.sap ocf_run rm -fv /usr/sap/$SID/$InstanceName/work/shutdown.sap ocf_run rm -fv /usr/sap/$SID/$InstanceName/data/rslgcpid ocf_run rm -fv /usr/sap/$SID/$InstanceName/data/rslgspid return 0 } # # sapinstance_start : Start the SAP instance # sapinstance_start() { sapuserexit PRE_START_USEREXIT "$OCF_RESKEY_PRE_START_USEREXIT" local rc=$OCF_NOT_RUNNING local output="" local loopcount=0 while [ $loopcount -lt 2 ] do loopcount=$(($loopcount + 1)) check_sapstartsrv rc=$? 
if [ $rc -eq $OCF_SUCCESS ]; then output=`$SAPCONTROL -nr $InstanceNr -function Start` rc=$? ocf_log info "Starting SAP Instance $SID-$InstanceName: $output" fi if [ $rc -ne 0 ] then ocf_log err "SAP Instance $SID-$InstanceName start failed." return $OCF_ERR_GENERIC fi local startrc=1 while [ $startrc -gt 0 ] do local waittime_start=`date +%s` output=`$SAPCONTROL -nr $InstanceNr -function WaitforStarted $OCF_RESKEY_START_WAITTIME 10` startrc=$? local waittime_stop=`date +%s` if [ $startrc -ne 0 ] then if [ $(($waittime_stop - $waittime_start)) -ge $OCF_RESKEY_START_WAITTIME ] then sapinstance_monitor NOLOG if [ $? -eq $OCF_SUCCESS ] then output="START_WAITTIME ($OCF_RESKEY_START_WAITTIME) has elapsed, but instance monitor returned SUCCESS. Instance considered running." startrc=0; loopcount=2 fi else if [ $loopcount -eq 1 ] && ocf_is_true $OCF_RESKEY_AUTOMATIC_RECOVER then ocf_log warn "SAP Instance $SID-$InstanceName start failed: $output" ocf_log warn "Try to recover $SID-$InstanceName" cleanup_instance else loopcount=2 fi startrc=-1 fi else loopcount=2 fi done done if [ $startrc -eq 0 ] then ocf_log info "SAP Instance $SID-$InstanceName started: $output" rc=$OCF_SUCCESS sapuserexit POST_START_USEREXIT "$OCF_RESKEY_POST_START_USEREXIT" if ocf_is_true $is_ers; then crm_attribute -n runs_ers_${SID} -v 1 -l reboot; fi else ocf_log err "SAP Instance $SID-$InstanceName start failed: $output" rc=$OCF_NOT_RUNNING if ocf_is_true $is_ers; then crm_attribute -n runs_ers_${SID} -v 0 -l reboot; fi fi return $rc } # # sapinstance_recover: Try startup of failed instance by cleaning up resources # sapinstance_recover() { cleanup_instance sapinstance_start return $? } # # sapinstance_stop: Stop the SAP instance # sapinstance_stop() { local output="" local rc sapuserexit PRE_STOP_USEREXIT "$OCF_RESKEY_PRE_STOP_USEREXIT" if [ "$OCF_RESKEY_SHUTDOWN_METHOD" = "KILL" ] then ocf_log info "Stopping SAP Instance $SID-$InstanceName with shutdown method KILL!" 
cleanup_instance return $OCF_SUCCESS fi check_sapstartsrv rc=$? if [ $rc -eq $OCF_SUCCESS ]; then output=`$SAPCONTROL -nr $InstanceNr -function Stop` rc=$? ocf_log info "Stopping SAP Instance $SID-$InstanceName: $output" fi if [ $rc -eq 0 ] then output=`$SAPCONTROL -nr $InstanceNr -function WaitforStopped 3600 1` if [ $? -eq 0 ] then ocf_log info "SAP Instance $SID-$InstanceName stopped: $output" rc=$OCF_SUCCESS else ocf_log err "SAP Instance $SID-$InstanceName stop failed: $output" rc=$OCF_ERR_GENERIC fi else ocf_log err "SAP Instance $SID-$InstanceName stop failed: $output" rc=$OCF_ERR_GENERIC fi sapuserexit POST_STOP_USEREXIT "$OCF_RESKEY_POST_STOP_USEREXIT" if ocf_is_true $is_ers; then crm_attribute -n runs_ers_${SID} -v 0 -l reboot; fi return $rc } # # sapinstance_monitor: Can the given SAP instance do anything useful? # sapinstance_monitor() { local MONLOG=$1 local rc check_sapstartsrv rc=$? if [ $rc -eq $OCF_SUCCESS ] then local count=0 local SERVNO local output output=`$SAPCONTROL -nr $InstanceNr -function GetProcessList -format script` # we have to parse the output, because the returncode doesn't tell anything about the instance status for SERVNO in `echo "$output" | grep '^[0-9] ' | cut -d' ' -f1 | sort -u` do local COLOR=`echo "$output" | grep "^$SERVNO dispstatus: " | cut -d' ' -f3` local SERVICE=`echo "$output" | grep "^$SERVNO name: " | cut -d' ' -f3` local STATE=0 local SEARCH case $COLOR in GREEN|YELLOW) STATE=$OCF_SUCCESS;; *) STATE=$OCF_NOT_RUNNING;; esac SEARCH=`echo "$OCF_RESKEY_MONITOR_SERVICES" | sed 's/\+/\\\+/g' | sed 's/\./\\\./g'` if [ `echo "$SERVICE" | egrep -c "$SEARCH"` -eq 1 ] then if [ $STATE -eq $OCF_NOT_RUNNING ] then [ "$MONLOG" != "NOLOG" ] && ocf_log err "SAP instance service $SERVICE is not running with status $COLOR !" 
rc=$STATE fi count=1 fi done if [ $count -eq 0 -a $rc -eq $OCF_SUCCESS ] then if ocf_is_probe then rc=$OCF_NOT_RUNNING else [ "$MONLOG" != "NOLOG" ] && ocf_log err "The SAP instance does not run any services which this RA could monitor!" rc=$OCF_ERR_GENERIC fi fi fi return $rc } # # sapinstance_status: Lightweight check of SAP instance only with OS tools # sapinstance_status() { local pid local pids [ ! -f "/usr/sap/$SID/$InstanceName/work/kill.sap" ] && return $OCF_NOT_RUNNING pids=`grep '^kill -[0-9]' /usr/sap/$SID/$InstanceName/work/kill.sap | awk '{print $3}'` for pid in $pids do [ `pgrep -f -U $sidadm $InstanceName | grep -c $pid` -gt 0 ] && return $OCF_SUCCESS done return $OCF_NOT_RUNNING } # # sapinstance_validate: Check the semantics of the input parameters # sapinstance_validate() { local rc=$OCF_SUCCESS if [ `echo "$SID" | grep -c '^[A-Z][A-Z0-9][A-Z0-9]$'` -ne 1 ] then ocf_log err "Parsing instance profile name: '$SID' is not a valid system ID!" rc=$OCF_ERR_ARGS fi if [ `echo "$InstanceName" | grep -c '^[A-Z].*[0-9][0-9]$'` -ne 1 ] then ocf_log err "Parsing instance profile name: '$InstanceName' is not a valid instance name!" rc=$OCF_ERR_ARGS fi if [ `echo "$InstanceNr" | grep -c '^[0-9][0-9]$'` -ne 1 ] then ocf_log err "Parsing instance profile name: '$InstanceNr' is not a valid instance number!" rc=$OCF_ERR_ARGS fi if [ `echo "$SAPVIRHOST" | grep -c '^[A-Za-z][A-Za-z0-9_-]*$'` -ne 1 ] then ocf_log err "Parsing instance profile name: '$SAPVIRHOST' is not a valid hostname!" rc=$OCF_ERR_ARGS fi return $rc } # # sapinstance_start_clone # sapinstance_start_clone() { sapinstance_init $OCF_RESKEY_ERS_InstanceName ${HA_SBIN_DIR}/crm_master -v 50 -l reboot sapinstance_start return $? } # # sapinstance_stop_clone # sapinstance_stop_clone() { sapinstance_init $OCF_RESKEY_ERS_InstanceName ${HA_SBIN_DIR}/crm_master -v 0 -l reboot sapinstance_stop return $? 
} # # sapinstance_monitor_clone # sapinstance_monitor_clone() { # first check with the status function (OS tools) if there could be something like a SAP instance running # as we do not know here, if we are in master or slave state we do not want to start our monitoring # agents (sapstartsrv) on the wrong host local rc sapinstance_init $OCF_RESKEY_InstanceName if sapinstance_status; then if sapinstance_monitor; then ${HA_SBIN_DIR}/crm_master -Q -v 100 -l reboot return $OCF_RUNNING_MASTER fi # by nature of the SAP enqueue server we have to make sure # that we do a failover to the slave (enqueue replication server) # in case the enqueue process has failed. We signal this to the # cluster by setting our master preference to a lower value than the slave. ${HA_SBIN_DIR}/crm_master -v 10 -l reboot return $OCF_FAILED_MASTER fi sapinstance_init $OCF_RESKEY_ERS_InstanceName sapinstance_status && sapinstance_monitor rc=$? if [ $rc -eq $OCF_SUCCESS ]; then ${HA_SBIN_DIR}/crm_master -Q -v 100 -l reboot fi return $rc } # # sapinstance_promote_clone: In a Master/Slave configuration get Master by starting the SCS instance and stopping the ERS instance # The order is important here to behave correct from the application levels view # sapinstance_promote_clone() { local rc sapinstance_init $OCF_RESKEY_InstanceName ocf_log info "Promoting $SID-$InstanceName to running Master." sapinstance_start rc=$? if [ $rc -eq $OCF_SUCCESS ]; then sapinstance_init $OCF_RESKEY_ERS_InstanceName sapinstance_stop rc=$? fi return $rc } # # sapinstance_demote_clone: In a Master/Slave configuration get Slave by stopping the SCS instance and starting the ERS instance # sapinstance_demote_clone() { local rc sapinstance_init $OCF_RESKEY_InstanceName ocf_log info "Demoting $SID-$InstanceName to a slave." sapinstance_stop rc=$? if [ $rc -eq $OCF_SUCCESS ]; then sapinstance_init $OCF_RESKEY_ERS_InstanceName sapinstance_start rc=$? 
fi return $rc } # # sapinstance_notify: Handle master scoring - to make sure a slave gets the next master # sapinstance_notify() { local n_type="$OCF_RESKEY_CRM_meta_notify_type" local n_op="$OCF_RESKEY_CRM_meta_notify_operation" if [ "${n_type}_${n_op}" = "post_promote" ]; then # After promotion of one master in the cluster, we make sure that all clones reset their master # value back to 100. This is because a failed monitor on a master might have degree one clone # instance to score 10. ${HA_SBIN_DIR}/crm_master -v 100 -l reboot elif [ "${n_type}_${n_op}" = "pre_demote" ]; then # if we are a slave and a demote event is announced, make sure we are highest on the list to become master # that is, when a slave resource was started after the promote event of an already running master (e.g. node of slave was down) # We also have to make sure to overrule the globally set resource_stickiness or any fail-count factors => INFINITY local n_uname="$OCF_RESKEY_CRM_meta_notify_demote_uname" if [ ${n_uname} != ${NODENAME} ]; then ${HA_SBIN_DIR}/crm_master -v INFINITY -l reboot fi fi } # # 'main' starts here... # ## GLOBALS SID="" sidadm="" InstanceName="" InstanceNr="" SAPVIRHOST="" DIR_EXECUTABLE="" SAPSTARTSRV="" SAPCONTROL="" DIR_PROFILE="" SAPSTARTPROFILE="" CLONE=0 NODENAME=$(ocf_local_nodename) if ( [ $# -ne 1 ] ) then sapinstance_usage exit $OCF_ERR_ARGS fi ACTION=$1 if [ "$ACTION" = "status" ]; then ACTION=monitor fi # These operations don't require OCF instance parameters to be set case "$ACTION" in usage|methods) sapinstance_$ACTION exit $OCF_SUCCESS;; meta-data) sapinstance_meta_data exit $OCF_SUCCESS;; notify) sapinstance_notify exit $OCF_SUCCESS;; *);; esac if ! ocf_is_root then ocf_log err "$0 must be run as root" exit $OCF_ERR_PERM fi # parameter check if [ -z "$OCF_RESKEY_InstanceName" ] then ocf_log err "Please set OCF_RESKEY_InstanceName to the name to the SAP instance profile!" exit $OCF_ERR_ARGS fi is_clone; CLONE=$? 
if [ ${CLONE} -eq 1 ] then CLACT=_clone else if [ "$ACTION" = "promote" -o "$ACTION" = "demote" ] then ocf_log err "$ACTION called in a non master/slave environment" exit $OCF_ERR_ARGS fi sapinstance_init $OCF_RESKEY_InstanceName fi # What kind of method was invoked? case "$ACTION" in start|stop|monitor|promote|demote) sapinstance_$ACTION$CLACT exit $?;; validate-all) sapinstance_validate exit $?;; *) sapinstance_methods exit $OCF_ERR_UNIMPLEMENTED;; esac diff --git a/heartbeat/Squid.in b/heartbeat/Squid.in index a46d9c9e2..b992909f9 100644 --- a/heartbeat/Squid.in +++ b/heartbeat/Squid.in @@ -1,446 +1,446 @@ #!@BASH_SHELL@ # # Description: Manages a Squid Server provided by NTT OSSC as an # OCF High-Availability resource under Heartbeat/LinuxHA control # # This program is free software; you can redistribute it and/or # modify it under the terms of the GNU General Public License # as published by the Free Software Foundation; either version 2 # of the License, or (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program; if not, write to the Free Software # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA # 02110-1301, USA. 
# # Copyright (c) 2008 NIPPON TELEGRAPH AND TELEPHONE CORPORATION # ####################################################################### # OCF parameters: # OCF_RESKEY_squid_exe : Executable file # OCF_RESKEY_squid_conf : Configuration file # OCF_RESKEY_squid_pidfile: Process id file # OCF_RESKEY_squid_port : Port number # OCF_RESKEY_debug_mode : Debug mode # OCF_RESKEY_debug_log : Debug log file # OCF_RESKEY_squid_stop_timeout: # Number of seconds to await to confirm a # normal stop method # # OCF_RESKEY_squid_exe, OCF_RESKEY_squid_conf, OCF_RESKEY_squid_pidfile # and OCF_RESKEY_squid_port must be specified. Each of the rests # has its default value or refers OCF_RESKEY_squid_conf to make # its value when no explicit value is given. ############################################################################### : ${OCF_FUNCTIONS_DIR=${OCF_ROOT}/lib/heartbeat} . ${OCF_FUNCTIONS_DIR}/ocf-shellfuncs usage() { cat <<-! usage: $0 action action: start : start a new squid instance stop : stop the running squid instance status : return the status of squid, run or down monitor : return TRUE if the squid appears to be working. meta-data : show meta data message validate-all: validate the instance parameters ! return $OCF_ERR_ARGS } metadata_squid() { cat < 1.0 The resource agent of Squid. This manages a Squid instance as an HA resource. Manages a Squid proxy server instance This is a required parameter. This parameter specifies squid's executable file. Executable file This is a required parameter. This parameter specifies a configuration file for a squid instance managed by this RA. Configuration file This is a required parameter. This parameter specifies a process id file for a squid instance managed by this RA. Pidfile This is a required parameter. This parameter specifies a port number -for a squid instance managed by this RA. If plural ports are used, -you must specifiy the only one of them. +for a squid instance managed by this RA. 
If multiple ports are used, +you must specify only one of them. Port number On stop, a squid shutdown is invoked first. If the resource doesn't stop within this timeout, we resort to stopping processes by sending signals and finally KILLing them. how long to wait for squid shutdown to stop the instance before resorting to kill This is an optional parameter. This RA runs in debug mode when this parameter includes 'x' or 'v'. If 'x' is included, both of STDOUT and STDERR redirect to the logfile specified by "debug_log", and then the builtin shell option 'x' is turned on. It is similar about 'v'. Debug mode This is an optional and omittable parameter. This parameter specifies a destination file for debug logs and works only if this RA run in debug mode. Refer to "debug_mode" about debug mode. If no value is given but it's requied, it's made by the following rules: "/var/log/" as a directory part, the basename of the configuration file given by "syslog_ng_conf" as a basename part, ".log" as a suffix. A destination of the debug log END return $OCF_SUCCESS } get_pids() { SQUID_PIDS=( ) # Seek by pattern SQUID_PIDS[0]=$(pgrep -f "$PROCESS_PATTERN") # Seek by pidfile SQUID_PIDS[1]=$(awk '1{print $1}' $SQUID_PIDFILE 2>/dev/null) if [[ -n "${SQUID_PIDS[1]}" ]]; then typeset exe exe=$(ls -l "/proc/${SQUID_PIDS[1]}/exe") if [[ $? = 0 ]]; then exe=${exe##*-> } if ! 
[[ "$exe" = $SQUID_EXE ]]; then SQUID_PIDS[1]="" fi else SQUID_PIDS[1]="" fi fi # Seek by port SQUID_PIDS[2]=$( netstat -apn | awk '/tcp.*:'$SQUID_PORT' .*LISTEN/ && $7~/^[1-9]/ { sub("\\/.*", "", $7); print $7; exit}') } are_all_pids_found() { if [[ -n "${SQUID_PIDS[0]}" ]] && [[ -n "${SQUID_PIDS[1]}" ]] && [[ -n "${SQUID_PIDS[2]}" ]] then return 0 else return 1 fi } are_pids_sane() { if [[ "${SQUID_PIDS[1]}" = "${SQUID_PIDS[2]}" ]]; then return $OCF_SUCCESS else ocf_exit_reason "$SQUID_NAME:Pid unmatch" return $OCF_ERR_GENERIC fi } is_squid_dead() { if [[ -z "${SQUID_PIDS[0]}" ]] && [[ -z "${SQUID_PIDS[2]}" ]] then return 0 else return 1 fi } monitor_squid() { typeset trialcount=0 while true; do get_pids if are_all_pids_found; then are_pids_sane return $OCF_SUCCESS fi if is_squid_dead; then return $OCF_NOT_RUNNING fi ocf_log info "$SQUID_NAME:Inconsistent processes:" \ "${SQUID_PIDS[0]},${SQUID_PIDS[1]},${SQUID_PIDS[2]}" (( trialcount = trialcount + 1 )) if (( trialcount > SQUID_CONFIRM_TRIALCOUNT )); then ocf_exit_reason "$SQUID_NAME:Inconsistency of processes remains unsolved" return $OCF_ERR_GENERIC fi sleep 1 done } start_squid() { typeset status monitor_squid status=$? if [[ $status != $OCF_NOT_RUNNING ]]; then return $status fi set -- "$SQUID_OPTS" ocf_run $SQUID_EXE -f "$SQUID_CONF" "$@" status=$? 
if [[ $status != $OCF_SUCCESS ]]; then return $OCF_ERR_GENERIC fi while true; do get_pids if are_all_pids_found && are_pids_sane; then return $OCF_SUCCESS fi ocf_log info "$SQUID_NAME:Waiting for squid to be invoked" sleep 1 done return $OCF_ERR_GENERIC } stop_squid() { typeset lapse_sec if ocf_run $SQUID_EXE -f $SQUID_CONF -k shutdown; then lapse_sec=0 while true; do get_pids if is_squid_dead; then rm -f $SQUID_PIDFILE return $OCF_SUCCESS fi (( lapse_sec = lapse_sec + 1 )) if (( lapse_sec > SQUID_STOP_TIMEOUT )); then break fi sleep 1 ocf_log info "$SQUID_NAME:$FUNCNAME:$LINENO: " \ "stop NORM $lapse_sec/$SQUID_STOP_TIMEOUT" done fi while true; do get_pids ocf_log info "$SQUID_NAME:$FUNCNAME:$LINENO: " \ "try to stop by SIGKILL:${SQUID_PIDS[0]} ${SQUID_PIDS[2]}" kill -KILL ${SQUID_PIDS[0]} ${SQUID_PIDS[2]} sleep 1 if is_squid_dead; then rm -f $SQUID_PIDFILE return $OCF_SUCCESS fi done return $OCF_ERR_GENERIC } status_squid() { return $OCF_SUCCESS } validate_all_squid() { ocf_log info "validate_all_squid[$SQUID_NAME]" return $OCF_SUCCESS } : === Debug ${0##*/} $1 === if [[ "$1" = "meta-data" ]]; then metadata_squid exit $? fi SQUID_CONF="${OCF_RESKEY_squid_conf}" if [[ -z "$SQUID_CONF" ]]; then ocf_exit_reason "SQUID_CONF is not defined" exit $OCF_ERR_CONFIGURED fi SQUID_NAME="${SQUID_CONF##*/}" SQUID_NAME="${SQUID_NAME%.*}" DEBUG_LOG="${OCF_RESKEY_debug_log-/var/log/squid_${SQUID_NAME}_debug}.log" DEBUG_MODE="" case $OCF_RESKEY_debug_mode in *x*) DEBUG_MODE="${DEBUG_MODE}x";; esac case $OCF_RESKEY_debug_mode in *v*) DEBUG_MODE="${DEBUG_MODE}v";; esac if [ -n "$DEBUG_MODE" ]; then PS4='\d \t \h '"${1-unknown} " export PS4 exec 1>>$DEBUG_LOG 2>&1 set -$DEBUG_MODE fi SQUID_EXE="${OCF_RESKEY_squid_exe}" if [[ -z "$SQUID_EXE" ]]; then ocf_exit_reason "SQUID_EXE is not defined" exit $OCF_ERR_CONFIGURED fi if [[ ! 
-x "$SQUID_EXE" ]]; then ocf_exit_reason "$SQUID_EXE is not found" exit $OCF_ERR_CONFIGURED fi SQUID_PIDFILE="${OCF_RESKEY_squid_pidfile}" if [[ -z "$SQUID_PIDFILE" ]]; then ocf_exit_reason "SQUID_PIDFILE is not defined" exit $OCF_ERR_CONFIGURED fi SQUID_PORT="${OCF_RESKEY_squid_port}" if [[ -z "$SQUID_PORT" ]]; then ocf_exit_reason "SQUID_PORT is not defined" exit $OCF_ERR_CONFIGURED fi SQUID_OPTS="${OCF_RESKEY_squid_opts}" SQUID_PIDS=( ) SQUID_CONFIRM_TRIALCOUNT="${OCF_RESKEY_squid_confirm_trialcount-3}" SQUID_STOP_TIMEOUT="${OCF_RESKEY_squid_stop_timeout-10}" SQUID_SUSPEND_TRIALCOUNT="${OCF_RESKEY_squid_suspend_trialcount-10}" PROCESS_PATTERN="$SQUID_EXE -f $SQUID_CONF" COMMAND=$1 case "$COMMAND" in start) ocf_log debug "[$SQUID_NAME] Enter squid start" start_squid func_status=$? ocf_log debug "[$SQUID_NAME] Leave squid start $func_status" exit $func_status ;; stop) ocf_log debug "[$SQUID_NAME] Enter squid stop" stop_squid func_status=$? ocf_log debug "[$SQUID_NAME] Leave squid stop $func_status" exit $func_status ;; status) status_squid exit $? ;; monitor) #ocf_log debug "[$SQUID_NAME] Enter squid monitor" monitor_squid func_status=$? #ocf_log debug "[$SQUID_NAME] Leave squid monitor $func_status" exit $func_status ;; validate-all) validate_all_squid exit $? ;; *) usage ;; esac # vim: set sw=4 ts=4 : diff --git a/heartbeat/Stateful b/heartbeat/Stateful index 809db4da9..894945f32 100755 --- a/heartbeat/Stateful +++ b/heartbeat/Stateful @@ -1,189 +1,189 @@ #!/bin/sh # # # Example of a stateful OCF Resource Agent. # # Copyright (c) 2006 Andrew Beekhof # # This program is free software; you can redistribute it and/or modify # it under the terms of version 2 of the GNU General Public License as # published by the Free Software Foundation. # # This program is distributed in the hope that it would be useful, but # WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
# # Further, this software is distributed without any warranty that it is # free of the rightful claim of any third person regarding infringement # or the like. Any license provided herein, whether implied or # otherwise, applies only to this software file. Patent licenses, if # any, provided herein do not apply to combinations of this program with # other software, or any other product whatsoever. # # You should have received a copy of the GNU General Public License # along with this program; if not, write the Free Software Foundation, # Inc., 59 Temple Place - Suite 330, Boston MA 02111-1307, USA. # ####################################################################### # Initialization: : ${OCF_FUNCTIONS_DIR=${OCF_ROOT}/lib/heartbeat} . ${OCF_FUNCTIONS_DIR}/ocf-shellfuncs CRM_MASTER="${HA_SBIN_DIR}/crm_master -l reboot" ####################################################################### meta_data() { cat < 1.0 -This is an example resource agent that impliments two states +This is an example resource agent that implements two states Example stateful resource agent Location to store the resource state in State file END exit $OCF_SUCCESS } ####################################################################### stateful_usage() { cat < ${OCF_RESKEY_state} } stateful_check_state() { target=$1 if [ -f ${OCF_RESKEY_state} ]; then state=`cat ${OCF_RESKEY_state}` if [ "x$target" = "x$state" ]; then return $OCF_SUCCESS fi else if [ "x$target" = "x" ]; then return $OCF_SUCCESS fi fi return $OCF_ERR_GENERIC } stateful_start() { stateful_check_state master if [ $? = 0 ]; then # CRM Error - Should never happen return $OCF_RUNNING_MASTER fi stateful_update slave $CRM_MASTER -v 5 return $OCF_SUCCESS } stateful_demote() { stateful_check_state if [ $? = 0 ]; then # CRM Error - Should never happen return $OCF_NOT_RUNNING fi stateful_update slave $CRM_MASTER -v 5 return $OCF_SUCCESS } stateful_promote() { stateful_check_state if [ $? 
= 0 ]; then return $OCF_NOT_RUNNING fi stateful_update master $CRM_MASTER -v 10 return $OCF_SUCCESS } stateful_stop() { $CRM_MASTER -D stateful_check_state master if [ $? = 0 ]; then # CRM Error - Should never happen return $OCF_RUNNING_MASTER fi if [ -f ${OCF_RESKEY_state} ]; then rm ${OCF_RESKEY_state} fi return $OCF_SUCCESS } stateful_monitor() { stateful_check_state "master" if [ $? = 0 ]; then return $OCF_RUNNING_MASTER fi stateful_check_state "slave" if [ $? = 0 ]; then return $OCF_SUCCESS fi if [ -f ${OCF_RESKEY_state} ]; then echo "File '${OCF_RESKEY_state}' exists but contains unexpected contents" cat ${OCF_RESKEY_state} return $OCF_ERR_GENERIC fi return $OCF_NOT_RUNNING } stateful_validate() { exit $OCF_SUCCESS } : ${OCF_RESKEY_state=${HA_RSCTMP}/Stateful-${OCF_RESOURCE_INSTANCE}.state} case $__OCF_ACTION in meta-data) meta_data;; start) stateful_start;; promote) stateful_promote;; demote) stateful_demote;; stop) stateful_stop;; monitor) stateful_monitor;; validate-all) stateful_validate;; usage|help) stateful_usage $OCF_SUCCESS;; *) stateful_usage $OCF_ERR_UNIMPLEMENTED;; esac exit $? diff --git a/heartbeat/db2 b/heartbeat/db2 index d24c770de..033005f94 100755 --- a/heartbeat/db2 +++ b/heartbeat/db2 @@ -1,902 +1,902 @@ #!/bin/sh # # db2 # # Resource agent that manages a DB2 LUW database in Standard role # or HADR configuration in master/slave configuration. # Multi partition is supported as well. # # Copyright (c) 2011 Holger Teutsch # # This agent incoporates code of a previous release created by # Alan Robertson and the community. # # This program is free software; you can redistribute it and/or modify # it under the terms of version 2 of the GNU General Public License as # published by the Free Software Foundation. # # This program is distributed in the hope that it would be useful, but # WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
# # Further, this software is distributed without any warranty that it is # free of the rightful claim of any third person regarding infringement # or the like. Any license provided herein, whether implied or # otherwise, applies only to this software file. Patent licenses, if # any, provided herein do not apply to combinations of this program with # other software, or any other product whatsoever. # # You should have received a copy of the GNU General Public License # along with this program; if not, write the Free Software Foundation, Inc., # 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. # ####################################################################### # Initialization: : ${OCF_FUNCTIONS_DIR=${OCF_ROOT}/lib/heartbeat} . ${OCF_FUNCTIONS_DIR}/ocf-shellfuncs ####################################################################### db2_usage() { echo "db2 start|stop|monitor|promote|demote|notify|validate-all|meta-data" } db2_meta_data() { cat < 1.0 Resource Agent that manages an IBM DB2 LUW databases in Standard role as primitive or in HADR roles in master/slave configuration. Multiple partitions are supported. Standard mode: An instance including all or selected databases is made highly available. Configure each partition as a separate primitive resource. HADR mode: A single database in HADR configuration is made highly available by automating takeover operations. Configure a master / slave resource with notifications enabled and an additional monitoring operation with role "Master". In case of HADR be very deliberate in specifying intervals/timeouts. The detection of a failure including promote must complete within HADR_PEER_WINDOW. In addition to honoring requirements for crash recovery etc. 
for your specific database use the following relations as guidance: "monitor interval" < HADR_PEER_WINDOW - (appr 30 sec) "promote timeout" < HADR_PEER_WINDOW + (appr 20 sec) For further information and examples consult http://www.linux-ha.org/wiki/db2_(resource_agent) Resource Agent that manages an IBM DB2 LUW databases in Standard role as primitive or in HADR roles as master/slave configuration. Multiple partitions are supported. The instance of the database(s). instance List of databases to be managed, e.g "db1 db2". Defaults to all databases in the instance. Specify one db for HADR mode. List of databases to be managed DEPRECATED: The admin user of the instance. DEPRECATED: admin -The number of the partion (DBPARTITIONNUM) to be managed. +The number of the partition (DBPARTITIONNUM) to be managed. database partition number (DBPARTITIONNUM) END } # # validate # .. and set global variables # # exit on error # db2_validate() { local db2home db2sql db2instance # db2 uses korn shell check_binary "ksh" # check required instance vars if [ -z "$OCF_RESKEY_instance" ] then ocf_log err "DB2 required parameter instance is not set!" return $OCF_ERR_CONFIGURED fi instance=$OCF_RESKEY_instance if [ -n "$OCF_RESKEY_admin" ] then ocf_log warn "DB2 deprecated parameter admin is set, using $OCF_RESKEY_admin as instance." instance=$OCF_RESKEY_admin fi db2node=${OCF_RESKEY_dbpartitionnum:-0} db2home=$(sh -c "echo ~$instance") db2sql=$db2home/sqllib db2profile=$db2sql/db2profile db2bin=$db2sql/bin STATE_FILE=${HA_RSCTMP}/db2-${OCF_RESOURCE_INSTANCE}.state # Let's make sure a few important things are there... if ! 
[ -d "$db2sql" -a -d "$db2bin" -a -f "$db2profile" -a \ -x "$db2profile" -a -x "$db2bin/db2" ] then ocf_is_probe && exit $OCF_NOT_RUNNING ocf_log err "DB2 required directories and/or files not found" exit $OCF_ERR_INSTALLED fi db2instance=$(runasdb2 'echo $DB2INSTANCE') if [ "$db2instance" != "$instance" ] then ocf_is_probe && exit $OCF_NOT_RUNNING ocf_log err "DB2 parameter instance \"$instance\" != DB2INSTANCE \"$db2instance\"" exit $OCF_ERR_CONFIGURED fi # enough checking for stop to succeed [ $__OCF_ACTION = stop ] && return $OCF_SUCCESS dblist=$OCF_RESKEY_dblist if [ -n "$dblist" ] then # support , as separator as well dblist=$(echo "$dblist" | sed -e 's/[,]/ /g') else if ! dblist=$(db2_dblist) then ocf_log err "DB2 $instance($db2node): cannot retrieve db directory" exit $OCF_ERR_INSTALLED fi fi # check requirements for the HADR case if ocf_is_ms then set -- $dblist if [ $# != 1 ] then ocf_log err "DB2 resource $OCF_RESOURCE_INSTANCE must have exactly one name in dblist" exit $OCF_ERR_CONFIGURED fi if [ $db2node != 0 ] then ocf_log err "DB2 resource $OCF_RESOURCE_INSTANCE must have dbpartitionnum=0" exit $OCF_ERR_CONFIGURED fi fi return $OCF_SUCCESS } master_score() { if ! have_binary "crm_master"; then return fi crm_master $* } # # Run the given command as db2 instance user # runasdb2() { su $instance -c ". $db2profile; $*" } # # Run a command as the DB2 admin, and log the output # logasdb2() { local output rc output=$(runasdb2 $*) rc=$? if [ $rc -eq 0 ] then ocf_log info "$output" else ocf_log err "$output" fi return $rc } # # maintain the fal (first active log) attribute # db2_fal_attrib DB {set val|get} # db2_fal_attrib() { local db=$1 local attr val rc id node member me attr=db2hadr_${instance}_${db}_fal case "$2" in set) me=$(uname -n) # loop over all member nodes and set attribute crm_node -l | while read id node member do [ "$member" = member -a "$node" != "$me" ] || continue crm_attribute -t nodes -l reboot --node=$node -n $attr -v "$3" rc=$? 
ocf_log info "DB2 instance $instance($db2node/$db: setting attrib for FAL to $FIRST_ACTIVE_LOG @ $node" [ $rc != 0 ] && break done ;; get) crm_attribute -t nodes -l reboot -n $attr -G --quiet 2>&1 rc=$? if [ $rc != 0 ] then ocf_log warn "DB2 instance $instance($db2node/$db: can't retrieve attribute $attr, are you sure notifications are enabled ?" fi ;; *) exit $OCF_ERR_CONFIGURED esac return $rc } # # unfortunately a first connect after a crash may need several minutes # for some internal cleanup stuff in DB2. # We run a connect in background so other connects (i.e. monitoring!) may proceed. # db2_run_connect() { local db=$1 logasdb2 "db2 connect to $db; db2 terminate" } # # get some data from the database config # sets HADR_ROLE HADR_TIMEOUT HADR_PEER_WINDOW # db2_get_cfg() { local db=$1 local output hadr_vars output=$(runasdb2 db2 get db cfg for $db) [ $? != 0 ] && return $OCF_ERR_GENERIC hadr_vars=$(echo "$output" | awk '/HADR database role/ {printf "HADR_ROLE='%s'; ", $NF;} /HADR_TIMEOUT/ {printf "HADR_TIMEOUT='%s'; ", $NF;} /First active log file/ {printf "FIRST_ACTIVE_LOG='%s'\n", $NF;} /HADR_PEER_WINDOW/ {printf "HADR_PEER_WINDOW='%s'\n", $NF;}') # sets HADR_ROLE HADR_TIMEOUT HADR_PEER_WINDOW eval $hadr_vars # HADR_PEER_WINDOW comes with V9 and is checked later if [ -z "$HADR_ROLE" -o -z "$HADR_TIMEOUT" ] then ocf_log error "DB2 cfg values invalid for $instance($db2node)/$db: $hadr_vars" return $OCF_ERR_GENERIC fi return $OCF_SUCCESS } # # return the list of databases in the instance # db2_dblist() { local output output=$(runasdb2 db2 list database directory) || return $OCF_ERR_GENERIC echo "$output" | grep -i 'Database name.*=' | sed 's%.*= *%%' } # # Delayed check of the compatibility of DB2 instance and pacemaker # config. # Logically this belongs to validate but certain parameters can only # be retrieved once the instance is started. # db2_check_config_compatibility() { local db=$1 local is_ms ocf_is_ms is_ms=$? 
case "$HADR_ROLE/$is_ms" in STANDARD/0) ocf_log err "DB2 database $instance/$db is not in a HADR configuration but I am a M/S resource" exit $OCF_ERR_INSTALLED ;; STANDARD/1) # OK ;; */0) if [ -z "$HADR_PEER_WINDOW" ] then ocf_log err "DB2 database $instance: release to old, need HADR_PEER_WINDOW (>=V9)" exit $OCF_ERR_INSTALLED fi ;; */1) ocf_log err "DB2 database $instance/$db is in a HADR configuration but I must be a M/S resource" esac } # # Start instance and DB. # Standard mode is through "db2 activate" in order to start in previous # mode (Standy/Primary). # If the database is a primary AND we can determine that the running master # has a higher "first active log" we conclude that we come up after a crash # an the previous Standby is now Primary. # The db is then started as Standby. # # Other cases: danger of split brain, log error and do nothing. # db2_start() { local output start_cmd db local start_opts="dbpartitionnum $db2node" # If we detect that db partitions are not in use, and no # partition is explicitly specified, activate without # partition information. This allows db2 instances without # partition support to be managed. if [ -z "$OCF_RESKEY_dbpartitionnum" ] && ! [ -a "$db2sql/db2nodes.cfg" ]; then start_opts="" fi if output=$(runasdb2 db2start $start_opts) then ocf_log info "DB2 instance $instance($db2node) started: $output" else case $output in *SQL1026N*) ocf_log info "DB2 instance $instance($db2node) already running: $output" ;; *) ocf_log err "$output" return $OCF_ERR_GENERIC esac fi if ! db2_instance_status then ocf_log err "DB2 instance $instance($db2node) is not active!" return $OCF_ERR_GENERIC fi [ $db2node = 0 ] || return $OCF_SUCCESS # activate DB only on node 0 for db in $dblist do # sets HADR_ROLE HADR_TIMEOUT HADR_PEER_WINDOW FIRST_ACTIVE_LOG db2_get_cfg $db || return $? 
# Better late than never: can only check this when the instance is already up db2_check_config_compatibility $db start_cmd="db2 activate db $db" if [ $HADR_ROLE = PRIMARY ] then local master_fal # communicate our FAL to other nodes the might start concurrently db2_fal_attrib $db set $FIRST_ACTIVE_LOG # ignore false positive: # error: Can't use > in [ ]. Escape it or use [[..]]. [SC2073] # see https://github.com/koalaman/shellcheck/issues/691 # shellcheck disable=SC2073 if master_fal=$(db2_fal_attrib $db get) && [ "$master_fal" '>' $FIRST_ACTIVE_LOG ] then ocf_log info "DB2 database $instance($db2node)/$db is Primary and outdated, starting as secondary" start_cmd="db2 start hadr on db $db as standby" HADR_ROLE=STANDBY fi fi if output=$(runasdb2 $start_cmd) then ocf_log info "DB2 database $instance($db2node)/$db started/activated" [ $HADR_ROLE != STANDBY ] && db2_run_connect $db & else case $output in SQL1490W*|SQL1494W*|SQL1497W*|SQL1777N*) ocf_log info "DB2 database $instance($db2node)/$db already activated: $output" ;; SQL1768N*"Reason code = \"7\""*) ocf_log err "DB2 database $instance($db2node)/$db is a Primary and the Standby is down" ocf_log err "Possible split brain ! Manual intervention required." ocf_log err "If this DB is outdated use \"db2 start hadr on db $db as standby\"" ocf_log err "If this DB is the surviving primary use \"db2 start hadr on db $db as primary by force\"" # might be the Standby is not yet there # might be a timing problem because "First active log" is delayed # on the next start attempt we might succeed when FAL was advanced # might be manual intervention is required # ... 
so let pacemaker give it another try and we will succeed then return $OCF_ERR_GENERIC ;; *) ocf_log err "DB2 database $instance($db2node)/$db didn't start: $output" return $OCF_ERR_GENERIC esac fi done # come here with success # Even if we are a db2 Primary pacemaker requires start to end up in slave mode echo SLAVE > $STATE_FILE return $OCF_SUCCESS } # # helper function to be spawned # so we can detect a hang of the db2stop command # db2_stop_bg() { local rc output local stop_opts="dbpartitionnum $db2node" rc=$OCF_SUCCESS if [ -z "$OCF_RESKEY_dbpartitionnum" ] && ! [ -a "$db2sql/db2nodes.cfg" ]; then stop_opts="" fi if output=$(runasdb2 db2stop force $stop_opts) then ocf_log info "DB2 instance $instance($db2node) stopped: $output" else case $output in *SQL1032N*) #SQL1032N No start database manager command was issued ocf_log info "$output" ;; *) ocf_log err "DB2 instance $instance($db2node) stop failed: $output" rc=$OCF_ERR_GENERIC esac fi return $rc } # # Stop the given db2 database instance # db2_stop() { local stop_timeout grace_timeout stop_bg_pid i must_kill # remove master score master_score -D -l reboot # be very early here in order to avoid stale data rm -f $STATE_FILE db2_instance_status if [ $? -eq $OCF_NOT_RUNNING ]; then ocf_log info "DB2 instance $instance already stopped" return $OCF_SUCCESS fi stop_timeout=${OCF_RESKEY_CRM_meta_timeout:-20000} # grace_time is 4/5 (unit is ms) grace_timeout=$((stop_timeout/1250)) # start db2stop in background as this may hang db2_stop_bg & stop_bg_pid=$! # wait for grace_timeout i=0 while [ $i -lt $grace_timeout ] do kill -0 $stop_bg_pid 2>/dev/null || break; sleep 1 i=$((i+1)) done # collect exit status but don't hang if kill -0 $stop_bg_pid 2>/dev/null then stoprc=1 kill -9 $stop_bg_pid 2>/dev/null else wait $stop_bg_pid stoprc=$? fi must_kill=0 if [ $stoprc -ne 0 ] then ocf_log warn "DB2 instance $instance($db2node): db2stop failed, using db2nkill" must_kill=1 elif ! 
db2_instance_dead then ocf_log warn "DB2 instance $instance($db2node): db2stop indicated success but there a still processes, using db2nkill" must_kill=1 fi if [ $must_kill -eq 1 ] then - # db2nkill kills *all* partions on the node + # db2nkill kills *all* partitions on the node if [ -x $db2bin/db2nkill ] then logasdb2 $db2bin/db2nkill $db2node elif [ -x $db2bin/db2_kill ] then logasdb2 $db2bin/db2_kill fi # loop forever (or lrmd kills us due to timeout) until the # instance is dead while ! db2_instance_dead do ocf_log info "DB2 instance $instance($db2node): waiting for processes to exit" sleep 1 done ocf_log info "DB2 instance $instance($db2node) is now dead" fi return $OCF_SUCCESS } # # check whether `enough´ processes for a healthy instance are up # db2_instance_status() { local pscount pscount=$(runasdb2 $db2bin/db2nps $db2node | cut -c9- | grep ' db2[^ ]' | wc -l) if [ $pscount -ge 4 ]; then return $OCF_SUCCESS; elif [ $pscount -ge 1 ]; then return $OCF_ERR_GENERIC fi return $OCF_NOT_RUNNING } # # is the given db2 instance dead? # db2_instance_dead() { local pscount pscount=$(runasdb2 $db2bin/db2nps $db2node | cut -c9- | grep ' db2[^ ]' | wc -l) test $pscount -eq 0 } # # return the status of the db as "Role/Status" # e.g. Primary/Peer, Standby/RemoteCatchupPending # # If not in HADR configuration return "Standard/Standalone" # db2_hadr_status() { local db=$1 local output output=$(runasdb2 db2pd -hadr -db $db) if [ $? != 0 ] then echo "Down/Off" return 1 fi echo "$output" | awk '/^\s+HADR_(ROLE|STATE) =/ {printf $3"/"} /^\s+HADR_CONNECT_STATUS =/ {print $3; exit; } /^HADR is not active/ {print "Standard/Standalone"; exit; } /^Role *State */ {getline; printf "%s/%s\n", $1, $2; exit; }' } # # Monitor the db # And as side effect set crm_master / FAL attribute # db2_monitor() { local CMD output hadr db local rc db2_instance_status rc=$? 
if [ $rc -ne $OCF_SUCCESS ]; then # instance is dead remove master score master_score -D -l reboot exit $rc fi [ $db2node = 0 ] || return 0 # monitoring only for partition 0 for db in $dblist do hadr=$(db2_hadr_status $db) || return $OCF_ERR_GENERIC ocf_log debug "Monitor: DB2 database $instance($db2node)/$db has HADR status $hadr" # set master preference accordingly case "$hadr" in PRIMARY/*|Primary/*|Standard/*) # perform a basic health check CMD="if db2 connect to $db; then db2 select \* from sysibm.sysversions ; rc=\$?; db2 terminate; else rc=\$?; fi; exit \$rc" if ! output=$(runasdb2 $CMD) then case "$output" in SQL1776N*) # can't connect/select on standby, may be spurious turing takeover ;; *) ocf_log err "DB2 database $instance($db2node)/$db is not working" ocf_log err "DB2 message: $output" # dead primary, remove master score master_score -D -l reboot return $OCF_ERR_GENERIC esac fi ocf_log debug "DB2 database $instance($db2node)/$db appears to be working" ocf_is_ms && master_score -v 10000 -l reboot ;; STANDBY/*PEER/*|Standby/*Peer) master_score -v 8000 -l reboot ;; STANDBY/*|Standby/*) ocf_log warn "DB2 database $instance($db2node)/$db in status $hadr can never be promoted" master_score -D -l reboot ;; *) return $OCF_ERR_GENERIC esac done # everything OK, return if running as slave grep MASTER $STATE_FILE >/dev/null 2>&1 || return $OCF_SUCCESS return $OCF_RUNNING_MASTER } # # Promote db to Primary # db2_promote() { # validate ensured that dblist contains only one entry local db=$dblist local i hadr output force # we run this twice as after a crash of the other node # within HADR_TIMEOUT the status may be still reported as Peer # although a connection no longer exists for i in 1 2 do hadr=$(db2_hadr_status $db) || return $OCF_ERR_GENERIC ocf_log info "DB2 database $instance($db2node)/$db has HADR status $hadr and will be promoted" case "$hadr" in Standard/Standalone) # this case only to keep ocf-tester happy return $OCF_SUCCESS ;; 
PRIMARY/PEER/*|PRIMARY/REMOTE_CATCHUP/*|Primary/Peer) # nothing to do, only update pacemaker's view echo MASTER > $STATE_FILE return $OCF_SUCCESS ;; STANDBY/PEER/CONNECTED|Standby/Peer) # must take over ;; STANDBY/*PEER/DISCONNECTED|Standby/DisconnectedPeer) # must take over forced force="by force peer window only" ;; *) return $OCF_ERR_GENERIC esac if output=$(runasdb2 db2 takeover hadr on db $db $force) then # update pacemaker's view echo MASTER > $STATE_FILE # turn the log so we rapidly get a new FAL logasdb2 "db2 archive log for db $db" return $OCF_SUCCESS fi case "$output" in SQL1770N*"Reason code = \"7\""*) # expected, HADR_TIMEOUT is now expired # go for the second try continue ;; *) ocf_log err "DB2 database $instance($db2node)/$db promote failed: $output" return $OCF_ERR_GENERIC esac done return $OCF_ERR_GENERIC } # # Demote db to standby # db2_demote() { # validate ensured that dblist contains only one entry local db=$dblist local hadr # house keeping, set pacemaker's view to slave echo SLAVE > $STATE_FILE hadr=$(db2_hadr_status $dblist) || return $OCF_ERR_GENERIC ocf_log info "DB2 database $instance($db2node)/$db has HADR status $hadr and will be demoted" db2_monitor return $? } # # handle pre start notification # We record our first active log on the other nodes. # If two primaries come up after a crash they can safely determine who is # the outdated one. # db2_notify() { local node # only interested in pre-start [ $OCF_RESKEY_CRM_meta_notify_type = pre \ -a $OCF_RESKEY_CRM_meta_notify_operation = start ] || return $OCF_SUCESS # gets FIRST_ACTIVE_LOG db2_get_cfg $dblist || return $? db2_fal_attrib $dblist set $FIRST_ACTIVE_LOG || return $OCF_ERR_GENERIC exit $OCF_SUCCESS } ######## # Main # ######## case "$__OCF_ACTION" in meta-data) db2_meta_data exit $OCF_SUCCESS ;; usage) db2_usage exit $OCF_SUCCESS ;; start) db2_validate db2_start || exit $? db2_monitor exit $? ;; stop) db2_validate db2_stop exit $? ;; promote) db2_validate db2_promote exit $? 
;; demote) db2_validate db2_demote exit $? ;; notify) db2_validate db2_notify exit $? ;; monitor) db2_validate db2_monitor exit $? ;; validate-all) db2_validate exit $? ;; *) db2_usage exit $OCF_ERR_UNIMPLEMENTED esac diff --git a/heartbeat/docker b/heartbeat/docker index 87f94a74f..653b0c359 100755 --- a/heartbeat/docker +++ b/heartbeat/docker @@ -1,564 +1,564 @@ #!/bin/sh # # The docker HA resource agent creates and launches a docker container # based off a supplied docker image. Containers managed by this agent # are both created and removed upon the agent's start and stop actions. # # Copyright (c) 2014 David Vossel # All Rights Reserved. # # This program is free software; you can redistribute it and/or modify # it under the terms of version 2 of the GNU General Public License as # published by the Free Software Foundation. # # This program is distributed in the hope that it would be useful, but # WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. # # Further, this software is distributed without any warranty that it is # free of the rightful claim of any third person regarding infringement # or the like. Any license provided herein, whether implied or # otherwise, applies only to this software file. Patent licenses, if # any, provided herein do not apply to combinations of this program with # other software, or any other product whatsoever. # # You should have received a copy of the GNU General Public License # along with this program; if not, write the Free Software Foundation, # Inc., 59 Temple Place - Suite 330, Boston MA 02111-1307, USA. # ####################################################################### # Initialization: : ${OCF_FUNCTIONS_DIR=${OCF_ROOT}/lib/heartbeat} . 
${OCF_FUNCTIONS_DIR}/ocf-shellfuncs ####################################################################### meta_data() { cat < 1.0 The docker HA resource agent creates and launches a docker container based off a supplied docker image. Containers managed by this agent are both created and removed upon the agent's start and stop actions. Docker container resource agent. The docker image to base this container off of. docker image The name to give the created container. By default this will be that resource's instance name. docker container name Allow the image to be pulled from the configured docker registry when the image does not exist locally. NOTE, this can drastically increase the time required to start the container if the image repository is pulled over the network. Allow pulling non-local images Add options to be appended to the 'docker run' command which is used when creating the container during the start action. This option allows users to do things such as setting a custom entry point and injecting environment variables into the newly created container. Note the '-d' option is supplied regardless of this value to force containers to run in the background. NOTE: Do not explicitly specify the --name argument in the run_opts. This agent will set --name using either the resource's instance or the name provided in the 'name' argument of this agent. run options -Specifiy a command to launch within the container once +Specify a command to launch within the container once it has initialized. run command A comma separated list of directories that the container is expecting to use. The agent will ensure they exist by running 'mkdir -p' Required mount points -Specifiy the full path of a command to launch within the container to check +Specify the full path of a command to launch within the container to check the health of the container. This command must return 0 to indicate that the container is healthy. 
A non-zero return code will indicate that the container has failed and should be recovered. If 'docker exec' is supported, it is used to execute the command. If not, nsenter is used. Note: Using this method for monitoring processes inside a container is not recommended, as containerd tries to track processes running inside the container and does not deal well with many short-lived processes being spawned. Ensure that your container monitors its own processes and terminates on fatal error rather than invoking a command from the outside. monitor command Kill a container immediately rather than waiting for it to gracefully shutdown force kill Allow the container to be reused after stopping the container. By default containers are removed after stop. With the reuse option containers will persist after the container stops. reuse container Query the builtin healthcheck of docker (v1.12+) to determine health of the container. If left empty or set to false it will not be used. The healthcheck itself has to be configured within docker, e.g. via HEALTHCHECK in Dockerfile. This option just queries in what condition docker considers the container to be and lets ocf do its thing accordingly. Note that the time a container is in "starting" state counts against the monitor timeout. This is an additional check besides the standard check for the container to be running, and the optional monitor_cmd check. It doesn't disable or override them, so all of them (if used) have to come back healthy for the container to be considered healthy. use healthcheck END } ####################################################################### REQUIRE_IMAGE_PULL=0 docker_usage() { cat </dev/null 2>&1; then out=$(docker exec ${CONTAINER} $OCF_RESKEY_monitor_cmd 2>&1) rc=$? else out=$(echo "$OCF_RESKEY_monitor_cmd" | nsenter --target $(docker inspect --format {{.State.Pid}} ${CONTAINER}) --mount --uts --ipc --net --pid 2>&1) rc=$? 
fi if [ $rc -eq 127 ]; then ocf_log err "monitor cmd failed (rc=$rc), output: $out" ocf_exit_reason "monitor_cmd, ${OCF_RESKEY_monitor_cmd} , not found within container." # there is no recovering from this, exit immediately exit $OCF_ERR_ARGS elif [ $rc -ne 0 ]; then ocf_exit_reason "monitor cmd failed (rc=$rc), output: $out" rc=$OCF_ERR_GENERIC else ocf_log debug "monitor cmd passed: exit code = $rc" fi return $rc } container_exists() { docker inspect --format {{.State.Running}} $CONTAINER | egrep '(true|false)' >/dev/null 2>&1 } remove_container() { if ocf_is_true "$OCF_RESKEY_reuse"; then # never remove the container if we have reuse enabled. return 0 fi container_exists if [ $? -ne 0 ]; then # don't attempt to remove a container that doesn't exist return 0 fi ocf_log notice "Cleaning up inactive container, ${CONTAINER}." ocf_run docker rm $CONTAINER } docker_simple_status() { local val container_exists if [ $? -ne 0 ]; then return $OCF_NOT_RUNNING fi # retrieve the 'Running' attribute for the container val=$(docker inspect --format {{.State.Running}} $CONTAINER 2>/dev/null) if [ $? -ne 0 ]; then #not running as a result of container not being found return $OCF_NOT_RUNNING fi if ocf_is_true "$val"; then # container exists and is running return $OCF_SUCCESS fi return $OCF_NOT_RUNNING } docker_health_status() { if ocf_is_true "$OCF_RESKEY_query_docker_health"; then local val container_exists if [ $? -ne 0 ]; then return $OCF_NOT_RUNNING fi # retrieve the 'Health' attribute for the container # This is a bash-style do-while loop to wait until instance is started. # if starting takes longer than monitor timeout then upstream will make this fail. while val=$(docker inspect --format {{.State.Health.Status}} $CONTAINER 2>/dev/null) if [ $? 
-ne 0 ]; then #not healthy as a result of container not being found return $OCF_NOT_RUNNING fi test "$val" = "starting" do sleep 1 done if [ "$val" = "healthy" ]; then # container exists and is healthy return $OCF_SUCCESS fi return $OCF_NOT_RUNNING fi return 0 } docker_monitor() { local rc=0 docker_simple_status rc=$? if [ $rc -ne 0 ]; then return $rc fi docker_health_status rc=$? if [ $rc -ne 0 ]; then return $rc fi monitor_cmd_exec } docker_create_mounts() { oldIFS="$IFS" IFS="," for directory in $OCF_RESKEY_mount_points; do mkdir -p "$directory" done IFS="$oldIFS" } docker_start() { docker_create_mounts local run_opts="-d --name=${CONTAINER}" # check to see if the container has already started docker_simple_status if [ $? -eq $OCF_SUCCESS ]; then return $OCF_SUCCESS fi if [ -n "$OCF_RESKEY_run_opts" ]; then run_opts="$run_opts $OCF_RESKEY_run_opts" fi if [ $REQUIRE_IMAGE_PULL -eq 1 ]; then ocf_log notice "Beginning pull of image, ${OCF_RESKEY_image}" docker pull "${OCF_RESKEY_image}" if [ $? -ne 0 ]; then ocf_exit_reason "failed to pull image ${OCF_RESKEY_image}" return $OCF_ERR_GENERIC fi fi if ocf_is_true "$OCF_RESKEY_reuse" && container_exists; then ocf_log info "starting existing container $CONTAINER." ocf_run docker start $CONTAINER else # make sure any previous container matching our container name is cleaned up first. # we already know at this point it wouldn't be running remove_container ocf_log info "running container $CONTAINER for the first time" ocf_run docker run $run_opts $OCF_RESKEY_image $OCF_RESKEY_run_cmd fi if [ $? -ne 0 ]; then ocf_exit_reason "docker failed to launch container" return $OCF_ERR_GENERIC fi # wait for monitor to pass before declaring that the container is started while true; do docker_simple_status if [ $? -ne $OCF_SUCCESS ]; then ocf_exit_reason "Newly created docker container exited after start" return $OCF_ERR_GENERIC fi monitor_cmd_exec if [ $? 
-eq $OCF_SUCCESS ]; then ocf_log notice "Container $CONTAINER started successfully" return $OCF_SUCCESS fi ocf_exit_reason "waiting on monitor_cmd to pass after start" sleep 1 done } docker_stop() { local timeout=60 docker_simple_status if [ $? -eq $OCF_NOT_RUNNING ]; then remove_container return $OCF_SUCCESS fi if [ -n "$OCF_RESKEY_CRM_meta_timeout" ]; then timeout=$((($OCF_RESKEY_CRM_meta_timeout/1000) -10 )) if [ $timeout -lt 10 ]; then timeout=10 fi fi if ocf_is_true "$OCF_RESKEY_force_kill"; then ocf_run docker kill $CONTAINER else ocf_log debug "waiting $timeout second[s] before killing container" ocf_run docker stop -t=$timeout $CONTAINER fi if [ $? -ne 0 ]; then ocf_exit_reason "Failed to stop container, ${CONTAINER}, based on image, ${OCF_RESKEY_image}." return $OCF_ERR_GENERIC fi remove_container if [ $? -ne 0 ]; then ocf_exit_reason "Failed to remove stopped container, ${CONTAINER}, based on image, ${OCF_RESKEY_image}." return $OCF_ERR_GENERIC fi return $OCF_SUCCESS } image_exists() { # if no tag was specified, use default "latest" local COLON_FOUND=0 local SLASH_FOUND=0 local SERVER_NAME="" local IMAGE_NAME="${OCF_RESKEY_image}" local IMAGE_TAG="latest" SLASH_FOUND="$(echo "${OCF_RESKEY_image}" | grep -o '/' | grep -c .)" if [ ${SLASH_FOUND} -ge 1 ]; then SERVER_NAME="$(echo ${IMAGE_NAME} | cut -d / -f 1-${SLASH_FOUND})" IMAGE_NAME="$(echo ${IMAGE_NAME} | awk -F'/' '{print $NF}')" fi COLON_FOUND="$(echo "${IMAGE_NAME}" | grep -o ':' | grep -c .)" if [ ${COLON_FOUND} -ge 1 ]; then IMAGE_TAG="$(echo ${IMAGE_NAME} | awk -F':' '{print $NF}')" IMAGE_NAME="$(echo ${IMAGE_NAME} | cut -d : -f 1-${COLON_FOUND})" fi # IMAGE_NAME might be following formats: # - image # - repository:port/image # - docker.io/image (some distro will display "docker.io/" as prefix) docker images | awk '{print $1 ":" $2}' | egrep -q -s "^(docker.io\/|${SERVER_NAME}\/)?${IMAGE_NAME}:${IMAGE_TAG}\$" if [ $? 
-eq 0 ]; then # image found return 0 fi if ocf_is_true "$OCF_RESKEY_allow_pull"; then REQUIRE_IMAGE_PULL=1 ocf_log notice "Image (${OCF_RESKEY_image}) does not exist locally but will be pulled during start" return 0 fi # image not found. return 1 } docker_validate() { check_binary docker if [ -z "$OCF_RESKEY_image" ]; then ocf_exit_reason "'image' option is required" exit $OCF_ERR_CONFIGURED fi if [ -n "$OCF_RESKEY_monitor_cmd" ]; then docker exec --help >/dev/null 2>&1 if [ ! $? ]; then ocf_log info "checking for nsenter, which is required when 'monitor_cmd' is specified" check_binary nsenter fi fi image_exists if [ $? -ne 0 ]; then ocf_exit_reason "base image, ${OCF_RESKEY_image}, could not be found." exit $OCF_ERR_CONFIGURED fi return $OCF_SUCCESS } # TODO : # When a user starts plural clones in a node in globally-unique, a user cannot appoint plural name parameters. # When a user appoints reuse, the resource agent cannot connect plural clones with a container. if ocf_is_true "$OCF_RESKEY_CRM_meta_globally_unique"; then if [ -n "$OCF_RESKEY_name" ]; then if [ -n "$OCF_RESKEY_CRM_meta_clone_node_max" ] && [ "$OCF_RESKEY_CRM_meta_clone_node_max" -ne 1 ] then ocf_exit_reason "Cannot make plural clones from the same name parameter." exit $OCF_ERR_CONFIGURED fi if [ -n "$OCF_RESKEY_CRM_meta_master_node_max" ] && [ "$OCF_RESKEY_CRM_meta_master_node_max" -ne 1 ] then ocf_exit_reason "Cannot make plural master from the same name parameter." exit $OCF_ERR_CONFIGURED fi fi : ${OCF_RESKEY_name=`echo ${OCF_RESOURCE_INSTANCE} | tr ':' '-'`} else : ${OCF_RESKEY_name=${OCF_RESOURCE_INSTANCE}} fi if [ -n "$OCF_RESKEY_container" ]; then # we'll keep the container attribute around for a bit in order not to break # any existing deployments. The 'name' attribute is prefered now though. 
CONTAINER=$OCF_RESKEY_container ocf_log warn "The 'container' attribute is depreciated" else CONTAINER=$OCF_RESKEY_name fi case $__OCF_ACTION in meta-data) meta_data exit $OCF_SUCCESS;; start) docker_validate docker_start;; stop) docker_stop;; monitor) docker_monitor;; validate-all) docker_validate;; usage|help) docker_usage exit $OCF_SUCCESS ;; *) docker_usage exit $OCF_ERR_UNIMPLEMENTED ;; esac rc=$? ocf_log debug "${OCF_RESOURCE_INSTANCE} $__OCF_ACTION : $rc" exit $rc diff --git a/heartbeat/ethmonitor b/heartbeat/ethmonitor index 21bf12be7..240eba4df 100755 --- a/heartbeat/ethmonitor +++ b/heartbeat/ethmonitor @@ -1,557 +1,557 @@ #!/bin/sh # # OCF Resource Agent compliant script. # Monitor the vitality of a local network interface. # # Based on the work by Robert Euhus and Lars Marowsky-Bree. # # Transfered from Ipaddr2 into ethmonitor by Alexander Krauth # # Copyright (c) 2011 Robert Euhus, Alexander Krauth, Lars Marowsky-Brée # All Rights Reserved. # # This program is free software; you can redistribute it and/or modify # it under the terms of version 2 of the GNU General Public License as # published by the Free Software Foundation. # # This program is distributed in the hope that it would be useful, but # WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. # # Further, this software is distributed without any warranty that it is # free of the rightful claim of any third person regarding infringement # or the like. Any license provided herein, whether implied or # otherwise, applies only to this software file. Patent licenses, if # any, provided herein do not apply to combinations of this program with # other software, or any other product whatsoever. # # You should have received a copy of the GNU General Public License # along with this program; if not, write the Free Software Foundation, # Inc., 59 Temple Place - Suite 330, Boston MA 02111-1307, USA. 
# # OCF parameters are as below # # OCF_RESKEY_interface # OCF_RESKEY_multiplicator # OCF_RESKEY_name # OCF_RESKEY_repeat_count # OCF_RESKEY_repeat_interval # OCF_RESKEY_pktcnt_timeout # OCF_RESKEY_arping_count # OCF_RESKEY_arping_timeout # OCF_RESKEY_arping_cache_entries # # TODO: Check against IPv6 # ####################################################################### # Initialization: : ${OCF_FUNCTIONS_DIR=${OCF_ROOT}/lib/heartbeat} . ${OCF_FUNCTIONS_DIR}/ocf-shellfuncs ####################################################################### meta_data() { cat < 1.2 Monitor the vitality of a local network interface. You may set up this RA as a clone resource to monitor the network interfaces on different nodes, with the same interface name. This is not related to the IP address or the network on which a interface is configured. You may use this RA to move resources away from a node, which has a faulty interface or prevent moving resources to such a node. -This gives you independend control of the resources, without involving cluster intercommunication. But it requires your nodes to have more than one network interface. +This gives you independent control of the resources, without involving cluster intercommunication. But it requires your nodes to have more than one network interface. The resource configuration requires a monitor operation, because the monitor does the main part of the work. In addition to the resource configuration, you need to configure some location constraints, based on a CIB attribute value. The name of the attribute value is configured in the 'name' option of this RA. Example constraint configuration using crmsh location loc_connected_node my_resource_grp \ rule $id="rule_loc_connected_node" -INF: ethmonitor eq 0 Example constraint configuration using pcs. Only allow 'my_resource' to run on nodes where eth0 ethernet device is available. 
pcs constraint location my_resource rule score=-INFINITY ethmonitor-eth0 ne 1 The ethmonitor works in 3 different modes to test the interface vitality. 1. call ip to see if the link status is up (if link is down -> error) 2. call ip and watch the RX counter (if packages come around in a certain time -> success) 3. call arping to check whether any of the IPs found in the local ARP cache answers an ARP REQUEST (one answer -> success) 4. return error Monitors network interfaces The name of the network interface which should be monitored (e.g. eth0). Network interface name The name of the CIB attribute to set. This is the name to be used in the constraints. Defaults to "ethmonitor-'interface_name'". Attribute name Multiplier for the value of the CIB attriobute specified in parameter name. Multiplier for result variable Specify how often the interface will be monitored, before the status is set to failed. You need to set the timeout of the monitoring operation to at least repeat_count * repeat_interval Monitor repeat count Specify how long to wait in seconds between the repeat_counts. Monitor repeat interval in seconds Timeout for the RX packet counter. Stop listening for packet counter changes after the given number of seconds. packet counter timeout Number of ARP REQUEST packets to send for every IP. Usually one ARP REQUEST (arping) is send Number of arpings per IP Time in seconds to wait for ARP REQUESTs (all packets of arping_count). This is to limit the time for arp requests, to be able to send requests to more than one node, without running in the monitor operation timeout. Timeout for arpings per IP Maximum number of IPs from ARP cache list to check for ARP REQUEST (arping) answers. Newest entries are tried first. Number of ARP cache entries to try For interfaces that are infiniband devices. infiniband device For infiniband devices, this is the port to monitor. infiniband port Only report success based on link status. 
Do not perform RX counter or arping related connectivity tests. link status check only END exit $OCF_SUCCESS } # # Return true, if the interface exists # is_interface() { # # List interfaces but exclude FreeS/WAN ipsecN virtual interfaces # local iface=`$IP2UTIL -o -f inet addr show | grep " $1 " \ | cut -d ' ' -f2 | sort -u | grep -v '^ipsec[0-9][0-9]*$'` [ "$iface" != "" ] } infiniband_status() { local device="$OCF_RESKEY_infiniband_device" if [ -n "$OCF_RESKEY_infiniband_port" ]; then device="${OCF_RESKEY_infiniband_device}:${OCF_RESKEY_infiniband_port}" fi case "${OCF_RESKEY_infiniband_device}" in *ib*|*mlx*) ibstatus ${device} | grep -q ACTIVE ;; *hfi*) opainfo | grep -q Active ;; esac } if_init() { local rc if [ X"$OCF_RESKEY_interface" = "X" ]; then ocf_exit_reason "Interface name (the interface parameter) is mandatory" exit $OCF_ERR_CONFIGURED fi NIC="$OCF_RESKEY_interface" if is_interface $NIC then case "$NIC" in *:*) ocf_exit_reason "Do not specify a virtual interface : $OCF_RESKEY_interface" exit $OCF_ERR_CONFIGURED;; *) ;; esac else case $__OCF_ACTION in validate-all) ocf_exit_reason "Interface $NIC does not exist" exit $OCF_ERR_CONFIGURED;; *) ## It might be a bond interface which is temporarily not available, therefore we want to continue here ocf_log warn "Interface $NIC does not exist" ;; esac fi : ${OCF_RESKEY_multiplier:="1"} if ! ocf_is_decimal "$OCF_RESKEY_multiplier"; then ocf_exit_reason "Invalid OCF_RESKEY_multiplier [$OCF_RESKEY_multiplier]" exit $OCF_ERR_CONFIGURED fi ATTRNAME=${OCF_RESKEY_name:-"ethmonitor-$NIC"} REP_COUNT=${OCF_RESKEY_repeat_count:-5} if ! ocf_is_decimal "$REP_COUNT" -o [ $REP_COUNT -lt 1 ]; then ocf_exit_reason "Invalid OCF_RESKEY_repeat_count [$REP_COUNT]" exit $OCF_ERR_CONFIGURED fi REP_INTERVAL_S=${OCF_RESKEY_repeat_interval:-10} if ! ocf_is_decimal "$REP_INTERVAL_S"; then ocf_exit_reason "Invalid OCF_RESKEY_repeat_interval [$REP_INTERVAL_S]" exit $OCF_ERR_CONFIGURED fi : ${OCF_RESKEY_pktcnt_timeout:="5"} if ! 
ocf_is_decimal "$OCF_RESKEY_pktcnt_timeout"; then ocf_exit_reason "Invalid OCF_RESKEY_pktcnt_timeout [$OCF_RESKEY_pktcnt_timeout]" exit $OCF_ERR_CONFIGURED fi : ${OCF_RESKEY_arping_count:="1"} if ! ocf_is_decimal "$OCF_RESKEY_arping_count"; then ocf_exit_reason "Invalid OCF_RESKEY_arping_count [$OCF_RESKEY_arping_count]" exit $OCF_ERR_CONFIGURED fi : ${OCF_RESKEY_arping_timeout:="1"} if ! ocf_is_decimal "$OCF_RESKEY_arping_timeout"; then ocf_exit_reason "Invalid OCF_RESKEY_arping_timeout [$OCF_RESKEY_arping_count]" exit $OCF_ERR_CONFIGURED fi : ${OCF_RESKEY_arping_cache_entries:="5"} if ! ocf_is_decimal "$OCF_RESKEY_arping_cache_entries"; then ocf_exit_reason "Invalid OCF_RESKEY_arping_cache_entries [$OCF_RESKEY_arping_cache_entries]" exit $OCF_ERR_CONFIGURED fi if [ -n "$OCF_RESKEY_infiniband_device" ]; then #ibstatus or opainfo is required if an infiniband_device is provided case "${OCF_RESKEY_infiniband_device}" in *ib*|*mlx*) check_binary ibstatus ;; *hfi*) check_binary opainfo ;; esac fi return $OCF_SUCCESS } # get the link status on $NIC # asks ip about running (up) interfaces, returns the number of matching interface names that are up get_link_status () { $IP2UTIL -o link show up dev "$NIC" | grep -v 'NO-CARRIER' | grep -c "$NIC" } # returns the number of received rx packets on $NIC get_rx_packets () { ocf_log debug "$IP2UTIL -o -s link show dev $NIC" $IP2UTIL -o -s link show dev "$NIC" \ | sed 's/.* RX: [^0-9]*[0-9]* *\([0-9]*\) .*/\1/' # the first number after RX: is the # of bytes , # the second is the # of packets received } # watch for packet counter changes for max. 
OCF_RESKEY_pktcnt_timeout seconds # returns immedeately with return code 0 if any packets were received # otherwise 1 is returned watch_pkt_counter () { local RX_PACKETS_NEW local RX_PACKETS_OLD RX_PACKETS_OLD="`get_rx_packets`" for n in `seq $(( $OCF_RESKEY_pktcnt_timeout * 10 ))`; do sleep 0.1 RX_PACKETS_NEW="`get_rx_packets`" ocf_log debug "RX_PACKETS_OLD: $RX_PACKETS_OLD RX_PACKETS_NEW: $RX_PACKETS_NEW" if [ "$RX_PACKETS_OLD" -ne "$RX_PACKETS_NEW" ]; then ocf_log debug "we received some packets." return 0 fi done return 1 } # returns list of cached ARP entries for $NIC # sorted by age ("last confirmed") # max. OCF_RESKEY_arping_cache_entries entries get_arp_list () { $IP2UTIL -s neighbour show dev $NIC \ | sort -t/ -k2,2n | cut -d' ' -f1 \ | head -n $OCF_RESKEY_arping_cache_entries # the "used" entries in `ip -s neighbour show` are: # "last used"/"last confirmed"/"last updated" } # arping the IP given as argument $1 on $NIC # until OCF_RESKEY_arping_count answers are received do_arping () { # TODO: add the source IP # TODO: check for diffenrent arping versions out there arping -q -c $OCF_RESKEY_arping_count -w $OCF_RESKEY_arping_timeout -I $NIC $1 # return with the exit code of the arping command return $? } # # Check the interface depending on the level given as parameter: $OCF_RESKEY_check_level # # 09: check for nonempty ARP cache # 10: watch for packet counter changes # # 19: check arping_ip_list # 20: check arping ARP cache entries # # 30: watch for packet counter changes in promiscios mode # # If unsuccessfull in levels 18 and above, # the tests for higher check levels are run. 
# if_check () { local arp_list # always check link status first link_status="`get_link_status`" ocf_log debug "link_status: $link_status (1=up, 0=down)" if [ $link_status -eq 0 ]; then ocf_log notice "link_status: DOWN" return $OCF_NOT_RUNNING fi # if this is an infiniband device, try ibstatus script if [ -n "$OCF_RESKEY_infiniband_device" ]; then if infiniband_status; then return $OCF_SUCCESS fi ocf_log info "Infiniband device $OCF_RESKEY_infiniband_device is not available, check ibstatus for more information" return $OCF_NOT_RUNNING fi # if using link_status_only, skip RX count and arping related tests if ocf_is_true "$OCF_RESKEY_link_status_only"; then return $OCF_SUCCESS fi # watch for packet counter changes ocf_log debug "watch for packet counter changes" watch_pkt_counter if [ $? -eq 0 ]; then return $OCF_SUCCESS else ocf_log debug "No packets received during packet watch timeout" fi # check arping ARP cache entries ocf_log debug "check arping ARP cache entries" arp_list=`get_arp_list` for ip in `echo $arp_list`; do do_arping $ip && return $OCF_SUCCESS done # if we get here, the ethernet device is considered not running. # provide some logging information if [ -z "$arp_list" ]; then ocf_log info "No ARP cache entries found to arping" fi # watch for packet counter changes in promiscios mode # ocf_log debug "watch for packet counter changes in promiscios mode" # be sure switch off promiscios mode in any case # TODO: check first, wether promisc is already on and leave it untouched. 
# trap "$IP2UTIL link set dev $NIC promisc off; exit" INT TERM EXIT # $IP2UTIL link set dev $NIC promisc on # watch_pkt_counter && return $OCF_SUCCESS # $IP2UTIL link set dev $NIC promisc off # trap - INT TERM EXIT # looks like it's not working (for whatever reason) return $OCF_NOT_RUNNING } ####################################################################### if_usage() { cat < /dev/null` sleep $sleep_time 2> /dev/null runs=$(($runs + 1)) fi if [ $mon_rc -eq $OCF_SUCCESS -a $runs -ne 0 ]; then ocf_log info "Monitoring of $OCF_RESOURCE_INSTANCE recovered from error" fi done ocf_log debug "Monitoring return code: $mon_rc" if [ $mon_rc -eq $OCF_SUCCESS ]; then set_cib_value 1 attr_rc=$? else ocf_log err "Monitoring of $OCF_RESOURCE_INSTANCE failed." set_cib_value 0 attr_rc=$? fi ## The resource should not fail, if the interface is down. It should fail, if the update of the CIB variable has errors. ## To react on the interface failure you must use constraints based on the CIB variable value, not on the resource itself. exit $attr_rc } if_stop() { attrd_updater -D -n $ATTRNAME ha_pseudo_resource $OCF_RESOURCE_INSTANCE stop } if_start() { local rc ha_pseudo_resource $OCF_RESOURCE_INSTANCE start rc=$? if [ $rc -ne $OCF_SUCCESS ]; then ocf_exit_reason "Failure to create ethmonitor state file" return $rc fi # perform the first monitor during the start operation if_monitor return $? } if_validate() { check_binary $IP2UTIL check_binary arping if_init } case $__OCF_ACTION in meta-data) meta_data ;; usage|help) if_usage exit $OCF_SUCCESS ;; esac if_validate case $__OCF_ACTION in start) if_start exit $? ;; stop) if_stop exit $? ;; monitor|status) if_monitor exit $? ;; validate-all) exit $? ;; *) if_usage exit $OCF_ERR_UNIMPLEMENTED ;; esac diff --git a/heartbeat/iface-vlan b/heartbeat/iface-vlan index bc8583cd4..783fa5b11 100755 --- a/heartbeat/iface-vlan +++ b/heartbeat/iface-vlan @@ -1,475 +1,475 @@ #!/bin/sh # # OCF Resource Agent compliant iface-vlan script. 
# # Implements network VLAN interface management # # Copyright (C) 2013 Red Hat, Inc. All rights reserved. # Author: Fabio M. Di Nitto # # This program is free software; you can redistribute it and/or modify # it under the terms of version 2 of the GNU General Public License as # published by the Free Software Foundation. # # This program is distributed in the hope that it would be useful, but # WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. # # Further, this software is distributed without any warranty that it is # free of the rightful claim of any third person regarding infringement # or the like. Any license provided herein, whether implied or # otherwise, applies only to this software file. Patent licenses, if # any, provided herein do not apply to combinations of this program with # other software, or any other product whatsoever. # # You should have received a copy of the GNU General Public License # along with this program; if not, write the Free Software Foundation, # Inc., 59 Temple Place - Suite 330, Boston MA 02111-1307, USA. # # # TODO: # # OCF parameters are as below # OCF_RESKEY_vlan_interface # OCF_RESKEY_vlan_id # OCF_RESKEY_vlan_name # OCF_RESKEY_vlan_reorder_hdr # OCF_RESKEY_vlan_gvrp # OCF_RESKEY_vlan_mvrp # OCF_RESKEY_vlan_loose_binding # ####################################################################### # Initialization: : ${OCF_FUNCTIONS_DIR=${OCF_ROOT}/lib/heartbeat} . 
${OCF_FUNCTIONS_DIR}/ocf-shellfuncs # Defaults OCF_RESKEY_vlan_reorder_hdr_default=1 OCF_RESKEY_vlan_gvrp_default=0 OCF_RESKEY_vlan_mvrp_default=0 OCF_RESKEY_vlan_loose_binding_default=0 OCF_RESKEY_vlan_name_default=${OCF_RESKEY_vlan_interface}.${OCF_RESKEY_vlan_id} : ${OCF_RESKEY_vlan_name=${OCF_RESKEY_vlan_name_default}} : ${OCF_RESKEY_vlan_reorder_hdr=${OCF_RESKEY_vlan_reorder_hdr_default}} : ${OCF_RESKEY_vlan_gvrp=${OCF_RESKEY_vlan_gvrp_default}} # don't set defaults for mvrp or loose binding since both # are rather new kernel features and they might not be supported #: ${OCF_RESKEY_vlan_mvrp=${OCF_RESKEY_vlan_mvrp_default}} #: ${OCF_RESKEY_vlan_loose_binding=${OCF_RESKEY_vlan_loose_binding_default}} ####################################################################### vlan_usage() { cat < 1.0 This resource manages VLAN network interfaces. It can add, remove, configure VLANs. Manages VLAN network interfaces. Define the interface where VLAN should be attached. Network interface. Define the VLAN ID. It has to be a value between 0 and 4094. Define the VLAN ID. Define the name of the VLAN interface (max 15 charaters). Name of the VLAN. Enable or disable header reordering. Enable or disable header reordering. Enable or disable GARP VLAN registration protocol. Enable or disable gvrp. Enable or disable Multiple VLAN Registration Protocol. Please note that most distributions do not ship a version of iproute2 that supports mvrp yet, even if the kernel has support for it. Check output of $IPADDR2 link add type vlan --help in the FLAG section to verify if mvrp support is available. Enable or disable mvrp. Enable or disable VLAN loose bind. By default the VLAN interface - admin status (UP/DOWN) follows the underneath inteface status. + admin status (UP/DOWN) follows the underneath interface status. Enabling loose bind allows the VLAN to disconnect from the interface status. Be very careful that enabling loose binding could invalidate this agent monitor operations. 
Please note that most distributions do not ship a version of iproute2 that supports loose_binding yet, even if the kernel has support for it. Check output of $IPADDR2 link add type vlan --help in the FLAG section to verify if loose_binding support is available. Enable or disable loose binding. END } # check if the interface is admin up/down iface_is_up() { if ! $IP2UTIL -o link show $1 | \ sed -e 's#.*<##g' -e 's#>.*##' -e 's#LOWER_UP##g' | \ grep -q UP; then return 1 fi return 0 } # check if the slaves have link layer up/down # see kernel network documentation on meaning of LOWER_UP flag # for more in depth explanation on how it works # NOTE: this check is not reliable in virt environment # since interfaces are always LOWER_UP. There is no way # from the guest to know if the host has disconnected somehow iface_lower_is_up() { if ! $IP2UTIL -o link show $1 | \ grep -q LOWER_UP; then return 1 fi return 0 } vlan_validate() { check_binary $IP2UTIL if [ -z "$OCF_RESKEY_vlan_interface" ]; then ocf_log err "Invalid OCF_RESKEY_vlan_interface: value cannot be empty" return 1 fi # the echo .. is the equivalent of strlen in bash # # /usr/include/linux/if.h:#define IFNAMSIZ 16 # needs to include 0 byte end string if [ "${#OCF_RESKEY_vlan_interface}" -gt 15 ]; then ocf_log err "Invalid OCF_RESKEY_vlan_interface: name is too long" return 1 fi if [ ! -d "/sys/class/net" ]; then ocf_log err "Unable to find sysfs network class in /sys" return 1 fi if [ ! -e "/sys/class/net/$OCF_RESKEY_vlan_interface" ]; then ocf_log err "Invalid OCF_RESKEY_vlan_interface: $OCF_RESKEY_vlan_interface does not exists" return 1 fi if [ -z "$OCF_RESKEY_vlan_id" ]; then ocf_log err "Invalid OCF_RESKEY_vlan_id: value cannot be empty" return 1 fi if ! 
ocf_is_decimal "$OCF_RESKEY_vlan_id" || \ [ "$OCF_RESKEY_vlan_id" -gt "4094" ]; then ocf_log err "Invalid OCF_RESKEY_vlan_id: must be a decimal value (0 to 4094 included)" return 1 fi if [ "${#OCF_RESKEY_vlan_name}" -gt 15 ]; then ocf_log err "Invalid OCF_RESKEY_vlan_name: name is too long" return 1 fi return 0 } vlan_check() { if [ -e "/sys/class/net/$OCF_RESKEY_vlan_name" ]; then if [ ! -e "$HA_RSCTMP/iface-vlan.$OCF_RESKEY_vlan_name" ]; then return $OCF_ERR_GENERIC fi else if [ -e "$HA_RSCTMP/iface-vlan.$OCF_RESKEY_vlan_name" ]; then error="$(rm -f "$HA_RSCTMP/iface-vlan.$OCF_RESKEY_vlan_name" 2>&1)" if [ "$?" != "0" ]; then ocf_log err "Unable to remove stale lock file for vlan $OCF_RESKEY_vlan_name: $error" return $OCF_ERR_GENERIC fi fi return $OCF_NOT_RUNNING fi if ! iface_is_up $OCF_RESKEY_vlan_interface; then if ocf_is_true "$OCF_RESKEY_vlan_loose_binding"; then ocf_log warn "Interface $OCF_RESKEY_vlan_interface is administratively down" else ocf_log err "Interface $OCF_RESKEY_vlan_interface is administratively down" return $OCF_ERR_GENERIC fi fi if ! iface_is_up $OCF_RESKEY_vlan_name; then ocf_log err "VLAN $OCF_RESKEY_vlan_name is administratively down" return $OCF_ERR_GENERIC fi if ! iface_lower_is_up $OCF_RESKEY_vlan_name; then ocf_log err "VLAN $OCF_RESKEY_vlan_name has no active link-layer" return $OCF_ERR_GENERIC fi return $OCF_SUCCESS } # we need a simpler stop version to clean after us if start fails # without involving any error checking # rolling back in case of failure is otherwise complex vlan_force_stop() { $IP2UTIL link delete "$OCF_RESKEY_vlan_name" >/dev/null 2>&1 rm -f "$HA_RSCTMP/iface-vlan.$OCF_RESKEY_vlan_name" 2>&1 } vlan_start() { # check if the vlan already exists vlan_check ret=$? if [ "$ret" != "$OCF_NOT_RUNNING" ]; then return $ret fi # make sure kernel module is loaded if [ ! -e /proc/net/vlan ]; then error="$(modprobe 8021q 2>&1)" if [ "$?" 
!= "0" ]; then ocf_log err "Unable to load kernel 8021q driver: $error" return $OCF_ERR_GENERIC fi fi # generate options VLANOPTS="" if [ -n "$OCF_RESKEY_vlan_reorder_hdr" ]; then if ocf_is_true "$OCF_RESKEY_vlan_reorder_hdr"; then VLANOPTS="reorder_hdr on" else VLANOPTS="reorder_hdr off" fi fi if [ -n "$OCF_RESKEY_vlan_gvrp" ]; then if ocf_is_true "$OCF_RESKEY_vlan_gvrp"; then VLANOPTS="$VLANOPTS gvrp on" else VLANOPTS="$VLANOPTS gvrp off" fi fi if [ -n "$OCF_RESKEY_vlan_mvrp" ]; then if ocf_is_true "$OCF_RESKEY_vlan_mvrp"; then VLANOPTS="$VLANOPTS mvrp on" else VLANOPTS="$VLANOPTS mvrp off" fi fi if [ -n "$OCF_RESKEY_vlan_loose_binding" ]; then if ocf_is_true "$OCF_RESKEY_vlan_loose_binding"; then VLANOPTS="$VLANOPTS loose_binding on" else VLANOPTS="$VLANOPTS loose_binding off" fi fi # create the VLAN error="$($IP2UTIL link add link "$OCF_RESKEY_vlan_interface" name "$OCF_RESKEY_vlan_name" type vlan id "$OCF_RESKEY_vlan_id" $VLANOPTS 2>&1)" if [ "$?" != "0" ]; then ocf_log err "Unable to create VLAN $OCF_RESKEY_vlan_name: $error" return $OCF_ERR_GENERIC fi # set the interface up error="$($IP2UTIL link set dev "$OCF_RESKEY_vlan_interface" up 2>&1)" if [ "$?" != "0" ]; then ocf_log err "Unable to set VLAN $OCF_RESKEY_vlan_interface up: $error" return $OCF_ERR_GENERIC fi # set the vlan up error="$($IP2UTIL link set dev "$OCF_RESKEY_vlan_name" up 2>&1)" if [ "$?" != "0" ]; then ocf_log err "Unable to set VLAN $OCF_RESKEY_vlan_name up: $error" return $OCF_ERR_GENERIC fi error="$(touch "$HA_RSCTMP/iface-vlan.$OCF_RESKEY_vlan_name" 2>&1)" if [ "$?" != "0" ]; then ocf_log err "Unable to create lock file for VLAN $OCF_RESKEY_vlan_name: $error" return $OCF_ERR_GENERIC fi return $OCF_SUCCESS } vlan_stop() { vlan_check ret=$? if [ "$ret" = "$OCF_NOT_RUNNING" ]; then return $OCF_SUCCESS fi if [ "$ret" != "$OCF_SUCCESS" ]; then return $ret fi # set vlan down error="$($IP2UTIL link set dev "$OCF_RESKEY_vlan_name" down 2>&1)" if [ "$?" 
!= "0" ]; then ocf_log err "Unable to set VLAN $OCF_RESKEY_vlan_name down: $error" return $OCF_ERR_GENERIC fi # delete vlan error="$($IP2UTIL link delete "$OCF_RESKEY_vlan_name" 2>&1)" if [ "$?" != "0" ]; then ocf_log err "Unable to delete VLAN $OCF_RESKEY_vlan_name: $error" return $OCF_ERR_GENERIC fi error="$(rm -f "$HA_RSCTMP/iface-vlan.$OCF_RESKEY_vlan_name" 2>&1)" if [ "$?" != "0" ]; then ocf_log err "Unable to remove lock file for VLAN $OCF_RESKEY_vlan_name: $error" return $OCF_ERR_GENERIC fi return $OCF_SUCCESS } case $__OCF_ACTION in meta-data) vlan_meta_data exit $OCF_SUCCESS ;; usage|help) vlan_usage exit $OCF_SUCCESS ;; esac if [ ! -d "$HA_RSCTMP" ]; then ocf_log debug "$HA_RSCTMP not found, we are probably being executed manually" mkdir -p "$HA_RSCTMP" fi if [ -n "$__OCF_ACTION" ] && ! vlan_validate; then exit $OCF_ERR_CONFIGURED fi case $__OCF_ACTION in start|stop) if ! ocf_is_root; then ocf_log err "You must be root for $__OCF_ACTION operation." exit $OCF_ERR_PERM fi ;; esac case $__OCF_ACTION in start) vlan_start ret=$? if [ "$ret" != "$OCF_SUCCESS" ]; then vlan_force_stop fi exit $ret ;; stop) vlan_stop exit $? ;; status|monitor) vlan_check exit $? ;; validate-all) # vlan_validate above does the trick ;; *) vlan_usage exit $OCF_ERR_UNIMPLEMENTED ;; esac # vi:sw=4:ts=8: diff --git a/heartbeat/kamailio.in b/heartbeat/kamailio.in index 5a401926f..3e83833c8 100644 --- a/heartbeat/kamailio.in +++ b/heartbeat/kamailio.in @@ -1,741 +1,741 @@ #!@BASH_SHELL@ # # OCF resource agent for Kamailio for pacemaker # # Copyright (c) 2013 FREQUENTIS AG, # Authors: Stefan Wenk # Rainer Brestan # # This program is free software; you can redistribute it and/or modify # it under the terms of version 2 of the GNU General Public License as # published by the Free Software Foundation. # # This program is distributed in the hope that it would be useful, but # WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
# # Further, this software is distributed without any warranty that it is # free of the rightful claim of any third person regarding infringement # or the like. Any license provided herein, whether implied or # otherwise, applies only to this software file. Patent licenses, if # any, provided herein do not apply to combinations of this program with # other software, or any other product whatsoever. # # You should have received a copy of the GNU General Public License # along with this program; if not, write the Free Software Foundation, # Inc., 59 Temple Place - Suite 330, Boston MA 02111-1307, USA. # # OCF input parameters: # OCF_RESKEY_binary # OCF_RESKEY_conffile # OCF_RESKEY_pidfile # OCF_RESKEY_monitoring_ip # OCF_RESKEY_listen_address # OCF_RESKEY_port # OCF_RESKEY_proto # OCF_RESKEY_sipsak # OCF_RESKEY_kamctl # OCF_RESKEY_kamctlrc # OCF_RESKEY_kamuser # OCF_RESKEY_kamgroup # OCF_RESKEY_extra_options # Initialization: : ${OCF_FUNCTIONS_DIR=${OCF_ROOT}/lib/heartbeat} . ${OCF_FUNCTIONS_DIR}/ocf-shellfuncs ####################################################################### # Defaults RESKEY_binary_default="/usr/sbin/kamailio" RESKEY_conffile_default="/etc/kamailio/kamailio.cfg" RESKEY_pidfile_default="/var/run/kamailio_${OCF_RESOURCE_INSTANCE}/kamailio.pid" RESKEY_monitoring_ip_default=127.0.0.1 RESKEY_port_default=5060 RESKEY_proto_default="udptcp" RESKEY_sipsak_default="/usr/bin/sipsak" RESKEY_kamctl_default="/usr/bin/kamctl" RESKEY_kamctlrc_default="/etc/kamailio/kamctlrc" RESKEY_kamuser_default="" RESKEY_kamgroup_default="" RESKEY_extra_options_default="" ####################################################################### : ${OCF_RESKEY_binary=${RESKEY_binary_default}} : ${OCF_RESKEY_conffile=${RESKEY_conffile_default}} : ${OCF_RESKEY_pidfile=${RESKEY_pidfile_default}} : ${OCF_RESKEY_monitoring_ip=${RESKEY_monitoring_ip_default}} : ${OCF_RESKEY_port=${RESKEY_port_default}} : ${OCF_RESKEY_proto=${RESKEY_proto_default}} : 
${OCF_RESKEY_sipsak=${RESKEY_sipsak_default}} : ${OCF_RESKEY_kamctl=${RESKEY_kamctl_default}} : ${OCF_RESKEY_kamctlrc=${RESKEY_kamctlrc_default}} : ${OCF_RESKEY_kamuser=${RESKEY_kamuser_default}} : ${OCF_RESKEY_kamgroup=${RESKEY_kamgroup_default}} : ${OCF_RESKEY_extra_options=${RESKEY_extra_options_default}} ####################################################################### usage() { cat < 1.0 Resource agent for the Kamailio SIP proxy/registrar. Multiple instances are possible when using following parameter combinations: Parameters for Kamailio instance 1: listen_address=192.168.159.128 monitoring_ip=192.168.159.128 proto=udptcp port=5060 Parameters for Kamailio instance 2: listen_address=192.168.159.128 monitoring_ip=192.168.159.128 proto=udp port=5070 conffile=/etc/kamailio/kamailio2.cfg kamctlrc="" Only one instance can be monitored via the command "kamctl monitor" because the kamctl tool of kamailio 4.x is not designed for multiple instances. Therefore, the provided kamctrlrc file path needs to be empty for instance 2, 3 ... Parameters for a third Kamailio instance: listen_address=192.168.159.128 monitoring_ip=192.168.159.128 proto=tcp port=5080 conffile=/etc/kamailio/kamailio3.cfg kamctlrc="" Resource agent for Kamailio The kamailio binary The kamailio binary The kamailio configuration file name with full path. For example, "/etc/kamailio/kamailio.cfg" , which is the default value. Make sure to use unique names in case of having multiple instances. Configuration file name with full path The kamailio PID file. The directory used must be writable by kamailio process user. Be sure to use unique name for running more than one instance. Try to use absolute path names. If empty, resource agent create a unique directory from the resource instance name for the PID file and assign it to the process user. PID file SIP IP Address of the kamailio instance used for SIP OPTIONS polling monitoring. 
Usually the same IP address value as for parameter listen_address should be provided. In order to respond with a 200 OK response to the SIP OOPTION requests, the kamailio.cfg file needs to contain following section: Note: The following "kamailio.cfg" code sniplet is part of an XML section. Therefore it contains two & characters, which need to be replaced with two ampersand characters within "kamailio.cfg": if (is_method("OPTIONS") && ($ru=~"sip:monitor@.*")) { ## ## If the method is an OPTIONS we are simply going to respond ## with a 200 OK. # xlog("L_INFO", "Method is an OPTIONS, probably just monitoring\n"); sl_send_reply("200", "Kamailio is alive"); exit; } Monitoring IP address used for SIP OPTIONS polling. SIP IP address the kamailio will listen on. Listening SIP address SIP port for the kamailio instance. SIP Port Extra options to add to kamailio start. extra_options The protocol used for SIP proto = udp|tcp|udptcp|conf_udp|conf_tcp|conf_udptcp. Using the options "conf_*" does not add any "-l" parameters to the kamailio command, the "listen" parameters from kamailio.conf are used instead. The sipsak checks are performed depending what protocol is defined after the underscore. protocol The installation path of the sipsak tool, which is used for monitoring Kamailio via SIP OPTIONS polling. sipsak path The installation path of the "kamctl" control tool. kamctl path The location of the "kamctlrc" file for the Kamailio instance. The file "kamctlrc" is the Kamailio configuration file for its "kamctl" control tool. This parameter only needs to be provided in case of using multiple Kamailio server instances on a single cluster node: - In case that the parameter "kamctlrc" is not empty, this ressource agent monitors + In case that the parameter "kamctlrc" is not empty, this resource agent monitors the health state of the Kamailio server via the command "kamctl monitor 1". This setting is recommended in case of using a single Kamailio server instance. 
- In case that the parameter "kamctlrc" is empty, the ressource agent does not + In case that the parameter "kamctlrc" is empty, the resource agent does not monitor the health state of the Kamailio server instance via the "kamctl" command. Please note that the "kamctl" control command of Kamailio 4.x does not support running multiple Kamailio instances on one host. Nevertheless this resource agent does allow multiple Kamailio instances per host. The result of the "kamctl" limitation in terms of number of Kamailio server instances is that the health check via "kamctl monitor 1" can be configured for a single Kamailio instance only. - Please refer to the long description of this resoure agent for an example + Please refer to the long description of this resource agent for an example of parameter combinations in case that multiple instances are to be configured per cluster node. kamctlrc path The user account for kamailio process to run with. Uses the current user, if not specified or empty. There is no check, if running kamailio with the specified user account is possible. kamailio user The group for kamailio process to run with. Uses the current group, if not specified or empty. kamailio group END exit $OCF_SUCCESS } ####################################################################### ### #Check if a process with given PID is running # Parameter 1: PID ### isRunning_PID() { kill -s 0 "$1" > /dev/null 2>&1 } ### #Check if an instance with given command line is running # Parameter 1: command line. ### isRunning_cmd() { pkill -s 0 "$1" > /dev/null 2>&1 } ### # Formats the result of a command. # # Parameter 1: Exit status. # Parameter 2: Standard output (stdout). # Parameter 3: Error output (stderr). # Returns: Formatted result. 
kamailio_format_result() {
    # $1 = exit status, $2 = captured stdout, $3 = captured stderr.
    # Prints a single line; the "value" and "error" parts are omitted when empty.
    local exitstatus="$1"
    local value="$2"
    local error="$3"
    local formatted="exit status: ${exitstatus}"

    if [ -n "$value" ]; then
        formatted="${formatted}, value: ${value}"
    fi

    if [ -n "$error" ]; then
        formatted="${formatted}, error: ${error}"
    fi

    # printf instead of "echo -n": the text is emitted verbatim regardless of
    # the shell's echo option handling.
    printf '%s\n' "$formatted"
}

###
# Put the command line, how the kamailio process is started according
# to the configured parameters, into the variable "kam_cmd".
#
# Reads:  OCF_RESKEY_binary, OCF_RESKEY_pidfile, OCF_RESKEY_conffile,
#         OCF_RESKEY_proto, OCF_RESKEY_listen_address, OCF_RESKEY_port,
#         OCF_RESKEY_kamuser, OCF_RESKEY_kamgroup, OCF_RESKEY_extra_options
# Writes: the global variable "kam_cmd" (nothing else).
###
kamailio_cmd() {
    # Start from an empty, function-local value on every call. Without this the
    # "conf_*" branch (which sets nothing) would pick up whatever "listen_param"
    # happened to hold already, e.g. a value inherited from the environment.
    local listen_param=""
    local udp_listen="-l udp:${OCF_RESKEY_listen_address}:${OCF_RESKEY_port} -l udp:127.0.0.1:${OCF_RESKEY_port}"
    local tcp_listen="-l tcp:${OCF_RESKEY_listen_address}:${OCF_RESKEY_port} -l tcp:127.0.0.1:${OCF_RESKEY_port}"

    case ${OCF_RESKEY_proto} in
        udp)
            listen_param="-T ${udp_listen}"
            ;;
        tcp)
            listen_param="${tcp_listen}"
            ;;
        udptcp)
            listen_param="${udp_listen} ${tcp_listen}"
            ;;
        conf_*)
            # doing nothing, no listen_param set
            ;;
        *)
            listen_param="-T"
            ;;
    esac

    kam_cmd="${OCF_RESKEY_binary} -P ${OCF_RESKEY_pidfile} -f ${OCF_RESKEY_conffile}"

    if [ -n "${listen_param}" ]; then
        kam_cmd="${kam_cmd} ${listen_param}"
    fi

    if [ -n "${OCF_RESKEY_kamuser}" ]; then
        kam_cmd="${kam_cmd} -u ${OCF_RESKEY_kamuser}"
    fi

    if [ -n "${OCF_RESKEY_kamgroup}" ]; then
        kam_cmd="${kam_cmd} -g ${OCF_RESKEY_kamgroup}"
    fi

    if [ -n "${OCF_RESKEY_extra_options}" ]; then
        kam_cmd="${kam_cmd} ${OCF_RESKEY_extra_options}"
    fi
}

###
# Gets the PID for the running Kamailio instance.
#
#     Returns:  The variable $PID contains the found PID value or an empty string.
# Exit Status:  Zero if the PID file was found and this process run under
#               the command line parameters of our instance.
#               1) if the PID file is not present and no process running under
#                  our command line options is active.
#               2) in all other fatal cases, which we classify in the followig
#                  as OCF_ERR_genering.
These are folloing cases: # a) The PID file contains a PID value which does no match to # to our instance # b) The PID contains a empty string in its first line # c) The PID file contains some text and some processeses # from our instance are still active kamailio_get_pid() { if [ -f ${OCF_RESKEY_pidfile} ]; then PID=`head -n 1 $OCF_RESKEY_pidfile` if [ ! -z "$PID" ]; then #Cross check if the PID file really contains a process of our kamailio instance: kamailio_cmd CROSSPID=`pgrep -o -f "${kam_cmd}"` if [ x"$PID" == x"$CROSSPID" ]; then #ocf_log debug "Found kamailio process PID with value: $PID." return 0 fi #ocf_log debug "PID file does not contain a PID of a $OCF_RESKEY_binary process!" return 2 fi #PID file does not contain a valid PID rm -f ${OCF_RESKEY_pidfile} return 2 fi # No PID file found! # Check if still a process exists even though we don't have the PID any longer: kamailio_cmd pgrep -f "${kam_cmd}" if [ $? -eq 0 ]; then ocf_log info "PID file does not contain a valid PID, but kamailio process is still active" return 2 fi ocf_log info "No PID file found and our kamailio instance is not active" return 1 } kamailio_status() { local not_running_log_level="warn" local errorfile error output if [ "$__OCF_ACTION" = "start" ]; then not_running_log_level="debug" fi kamailio_get_pid >/dev/null RET=$? if [ $RET -ne 0 ]; then if [ $RET -eq 2 ]; then ocf_log $not_running_log_level "PID file does not contain a PID of a ${OCF_RESKEY_binary} process!" return $OCF_ERR_GENERIC fi return $OCF_NOT_RUNNING fi PID=`head -n 1 $OCF_RESKEY_pidfile` isRunning_PID "$PID" RET=$? if [ "$RET" -ne 0 ]; then ocf_log $not_running_log_level "PID from $PID from ${OCF_RESKEY_pidfile} not running" rm -f ${OCF_RESKEY_pidfile} return $OCF_NOT_RUNNING fi rc=0 # In case that OCF_RESKEY_kamctlrc we perfom a health check via "kamctl monitor 1" if [ ! 
-z ${OCF_RESKEY_kamctlrc} ]; then # PID is running now but it is not save to check via kamctl without care, because # the implementation analysis in the case that we kill all running processes # shows that in case that the fifo cannot be read, then kamctl blocks. This needs # to be avoided. # In order to be on the safe side, we run this check therefore under "timeout" control: rc=1 timeout 3 ${OCF_RESKEY_kamctl} monitor 1 |grep "since" ; rc=$? fi if [ $rc -ne 0 ]; then ocf_log $not_running_log_level "Kamailio is not up according to kamctl monitor!" return $OCF_NOT_RUNNING fi errorfile=`mktemp` case ${OCF_RESKEY_proto} in udp) output=`$OCF_RESKEY_sipsak -s sip:monitor@$OCF_RESKEY_monitoring_ip:${OCF_RESKEY_port} -H localhost --transport udp>/dev/null 2>>$errorfile` result=$? ;; tcp) output=`$OCF_RESKEY_sipsak -s sip:monitor@$OCF_RESKEY_monitoring_ip:${OCF_RESKEY_port} -H localhost --transport tcp>/dev/null 2>>$errorfile` result=$? ;; udptcp) output=`$OCF_RESKEY_sipsak -s sip:monitor@$OCF_RESKEY_monitoring_ip:${OCF_RESKEY_port} -H localhost --transport tcp>/dev/null 2>>$errorfile` result=$? if [ $result -eq 0 ]; then output=`$OCF_RESKEY_sipsak -s sip:monitor@$OCF_RESKEY_monitoring_ip:${OCF_RESKEY_port} -H localhost --transport udp>/dev/null 2>>$errorfile` result=$? fi ;; *) output=`$OCF_RESKEY_sipsak -s sip:monitor@$OCF_RESKEY_monitoring_ip:${OCF_RESKEY_port} -H localhost --transport udp>/dev/null 2>>$errorfile` result=$? ;; esac error=`cat $errorfile` rm -f $errorfile if [ $result -ne 0 ]; then ocf_log $not_running_log_level "Kamailio is running, but not functional as sipsak ${OCF_RESKEY_proto} failed with $(kamailio_format_result $result "$output" "$error")" return $OCF_ERR_GENERIC fi return $OCF_SUCCESS } kamailio_monitor() { kamailio_status } kamailio_start() { local errorfile error output piddir if kamailio_status then ocf_log info "kamailio already running." 
return $OCF_SUCCESS fi # if pidfile directory does not exist, create it with kamailio process owner piddir=`dirname "${OCF_RESKEY_pidfile}"` if [ ! -d "$piddir" ]; then mkdir -p "$piddir" if [ "$OCF_RESKEY_kamuser" != "" ]; then chown ${OCF_RESKEY_kamuser} "$piddir" fi fi kamailio_cmd if [ "$OCF_RESKEY_kamuser" != "" ]; then kam_cmd="su -s @BASH_SHELL@ $OCF_RESKEY_kamuser -c \"$kam_cmd\"" fi ocf_log info "start kamailio with $kam_cmd." errorfile=`mktemp` output=$(eval ${kam_cmd} 2>>$errorfile) result=$? error=`cat $errorfile` rm -f $errorfile if [ $result -eq 0 ]; then result=1 while [ $result -ne 0 ]; do sleep 1 kamailio_get_pid >/dev/null result=$? done ocf_log info "kamailio instance PID=$PID started." # check with monitor operation if running correctly result=$OCF_ERR_GENERIC while [ $result -ne $OCF_SUCCESS ]; do sleep 1 kamailio_monitor result=$? ocf_log info "monitor in start returned $result" done ocf_log info "kamailio started successful." else ocf_log err "kamailio instance could not be started, $(kamailio_format_result $result "$output" "$error")" result=$OCF_ERR_GENERIC fi return $result } kamailio_stop() { local piddir local TRIES=0 result=$OCF_SUCCESS kamailio_cmd ocf_log info "Stopping kamailio by sending SIGTERM to ${kam_cmd}" pkill -SIGTERM -x -f "${kam_cmd}" if [ $? -eq 1 ]; then # already stopped. no processes found # in case of not specified pidfile, delete the created directory # otherwise only the pidfile itself if [ "${OCF_RESKEY_pidfile}" == "${RESKEY_pidfile_default}" ]; then piddir=`dirname "${OCF_RESKEY_pidfile}"` rm -rf "$piddir" else rm -f "${OCF_RESKEY_pidfile}" fi return $result fi if [ "$OCF_RESKEY_CRM_meta_timeout" != "" ]; then KAMAILIO_STOP_TIMEOUT=$(( ($OCF_RESKEY_CRM_meta_timeout/1000) - 7 )) else KAMAILIO_STOP_TIMEOUT=20 fi while isRunning_cmd "${kam_cmd}" && [ "$TRIES" -lt "${KAMAILIO_STOP_TIMEOUT}" ] do sleep 1 ocf_log info "kamailio ${kam_cmd} is still running after SIGTERM" ((TRIES++)) done isRunning_cmd "${kam_cmd}" RET=$? 
if [ "$RET" -eq 0 ]; then ocf_log info "Killing ${kam_cmd} with SIGKILL" TRIES=0 pkill -SIGKILL -x -f "${kam_cmd}" > /dev/null 2>&1 while isRunning_cmd "${kam_cmd}" && [ "$TRIES" -lt 3 ] do sleep 1 ocf_log info "kamailio ${kam_cmd} is still running after SIGKILL" ((TRIES++)) done isRunning_cmd "${kam_cmd}" RET=$? if [ "$RET" -eq 0 ]; then ocf_log fatal "kamailio is still running even after SIGKILL" result=$OCF_ERR_GENERIC fi else ocf_log info "${kam_cmd} has stopped." fi # in case of not specified pidfile, delete the created directory # otherwise only the pidfile itself if [ "${OCF_RESKEY_pidfile}" == "${RESKEY_pidfile_default}" ]; then piddir=`dirname "${OCF_RESKEY_pidfile}"` rm -rf "$piddir" else rm -f "${OCF_RESKEY_pidfile}" fi return $result } kamailio_validate_all() { # Check if kamailio configuration is valid before starting the server if [ ! -f $OCF_RESKEY_binary ]; then ocf_log err "File OCF_RESKEY_binary [${OCF_RESKEY_binary}] does not exist!" return $OCF_NOT_INSTALLED fi out=$($OCF_RESKEY_binary -c 2>&1 > /dev/null) retcode=$? if [ "$retcode" -ne '0' ]; then ocf_log info "Not starting kamailio: $OCF_RESKEY_binary does not start!" return $OCF_ERR_CONFIGURED fi case $OCF_RESKEY_monitoring_ip in "") ocf_log err "Required parameter OCF_RESKEY_monitoring_ip is missing!" return $OCF_ERR_CONFIGURED ;; [0-9]*.[0-9]*.[0-9]*.[0-9]*) : OK ;; *) ocf_log err "Parameter OCF_RESKEY_monitoring_ip [$OCF_RESKEY_monitoring_ip] is not an IP address!" return $OCF_ERR_CONFIGURED ;; esac case $OCF_RESKEY_listen_address in "") ocf_log err "Required parameter $OCF_RESKEY_listen_address is missing!" return $OCF_ERR_CONFIGURED ;; [0-9]*.[0-9]*.[0-9]*.[0-9]*) : OK ;; *) ocf_log err "Parameter OCF_RESKEY_listen_address [$OCF_RESKEY_listen_address] not an IP address!" return $OCF_ERR_CONFIGURED ;; esac if [ ! -f ${OCF_RESKEY_sipsak} ]; then ocf_log err "sipsak [${OCF_RESKEY_sipsak}] does not exist!" return $OCF_NOT_INSTALLED fi if [ ! -z ${OCF_RESKEY_kamctlrc} ]; then if [ ! 
-f ${OCF_RESKEY_kamctlrc} ]; then ocf_log err "kamctlrc file [${kamctlrc}] does not exist!" return $OCF_NOT_INSTALLED fi else ocf_log debug "No monitoring via kamctl monitor because the parameter [kamctlrc] is empty." fi if [ ! -f ${OCF_RESKEY_conffile} ]; then ocf_log err "Kamailio configuration file provided in the parameter conffile [${OCF_RESKEY_conffile}] does not exist!" return $OCF_ERR_CONFIGURED fi case $OCF_RESKEY_proto in "") ocf_log err "Parameter $OCF_RESKEY_proto is empty!" return $OCF_ERR_CONFIGURED ;; udp|tcp|udptcp) : OK ;; *) ocf_log err "Parameter value $OCF_RESKEY_proto for parameter [proto] not yet supported!" return $OCF_ERR_CONFIGURED ;; esac return $OCF_SUCCESS } if [ $# -ne 1 ]; then usage exit $OCF_ERR_ARGS fi case $__OCF_ACTION in meta-data) meta_data exit $OCF_SUCCESS ;; start|stop|status|monitor) kamailio_${__OCF_ACTION} ;; validate-all) kamailio_validate_all ;; notify) exit $OCF_SUCCESS ;; usage) usage exit $OCF_SUCCESS ;; # reload) #Not supported by Kamailio, but not needed by pacemaker # ;; # recover #Not needed by pacemaker # ;; *) usage exit $OCF_ERR_UNIMPLEMENTED ;; esac exit $? diff --git a/heartbeat/ovsmonitor b/heartbeat/ovsmonitor index 000854bcd..120977462 100755 --- a/heartbeat/ovsmonitor +++ b/heartbeat/ovsmonitor @@ -1,450 +1,450 @@ #!/bin/sh # # OCF Resource Agent compliant script. # Monitor the vitality of a local OpenVSwitch bond. # # Based on the work by Alexander Krauth. # # Transfered from ethmonitor into ovsmonitor by Mathieu Grzybek. # # Copyright (c) 2017 Robert Euhus, Alexander Krauth, Lars Marowsky-Bré # Mathieu Grzybek # All Rights Reserved. # # This program is free software; you can redistribute it and/or modify # it under the terms of version 2 of the GNU General Public License as # published by the Free Software Foundation. # # This program is distributed in the hope that it would be useful, but # WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
# # Further, this software is distributed without any warranty that it is # free of the rightful claim of any third person regarding infringement # or the like. Any license provided herein, whether implied or # otherwise, applies only to this software file. Patent licenses, if # any, provided herein do not apply to combinations of this program with # other software, or any other product whatsoever. # # You should have received a copy of the GNU General Public License # along with this program; if not, write the Free Software Foundation, # Inc., 59 Temple Place - Suite 330, Boston MA 02111-1307, USA. # # OCF parameters are as below # # OCF_RESKEY_bond # OCF_RESKEY_bridge # OCF_RESKEY_multiplicator # OCF_RESKEY_name # OCF_RESKEY_repeat_count # OCF_RESKEY_repeat_interval # OCF_RESKEY_pktcnt_timeout # ####################################################################### # Initialization: : ${OCF_FUNCTIONS_DIR=${OCF_ROOT}/lib/heartbeat} . ${OCF_FUNCTIONS_DIR}/ocf-shellfuncs ####################################################################### meta_data() { cat < 0.1 Monitor the vitality of a local ovs bond. You may set up this RA as a clone resource to monitor the network bonds on different nodes, with the same bond name. This is not related to the IP address or the network on which a bond is configured. You may use this RA to move resources away from a node, which has a faulty bond or prevent moving resources to such a node. -This gives you independend control of the resources, without involving cluster intercommunication. But it requires your nodes to have more than one network bond. +This gives you independent control of the resources, without involving cluster intercommunication. But it requires your nodes to have more than one network bond. The resource configuration requires a monitor operation, because the monitor does the main part of the work. 
In addition to the resource configuration, you need to configure some location constraints, based on a CIB attribute value. The name of the attribute value is configured in the 'name' option of this RA. Example constraint configuration using crmsh location loc_connected_node my_resource_grp \ rule $id="rule_loc_connected_node" -INF: ovsmonitor-bond-public eq 0 Example constraint configuration using pcs. Only allow 'my_resource' to run on nodes where eth0 ethernet device is available. pcs constraint location my_resource rule score=-INFINITY ovsmonitor-bond-public ne 1 The ethmonitor works in 3 different modes to test the bond vitality. 1. call ovs-appctl to see if at least one of the bonding's link status is up (if link is down -> error) 2. call ovs-ofctl and watch the RX counter (if packages come around in a certain time -> success) 3. return error Monitors ovs bonding bonds The name of the network bond which should be monitored (e.g. bond-public). Bond bond name The name of the ovs bridge that contains the bridge. ovs bridge The name of the CIB attribute to set. This is the name to be used in the constraints. Defaults to "ethmonitor-'bond_name'". Attribute name Multiplier for the value of the CIB attriobute specified in parameter name. Multiplier for result variable Specify how often the bond will be monitored, before the status is set to failed. You need to set the timeout of the monitoring operation to at least repeat_count * repeat_interval Monitor repeat count Specify how long to wait in seconds between the repeat_counts. Monitor repeat interval in seconds Timeout for the RX packet counter. Stop listening for packet counter changes after the given number of seconds. packet counter timeout Only report success based on link status. Do not perform RX counter related connectivity tests. 
link status check only END exit $OCF_SUCCESS } # # Return true, if the bond exists # is_bond() { # # List bonds but exclude FreeS/WAN ipsecN virtual bonds # ovs-appctl bond/show $OCF_RESKEY_bond 1>/dev/null 2>&1 } # # Return true, if the bridge exists # is_bridge() { # # List bonds but exclude FreeS/WAN ipsecN virtual bonds # #ovs-appctl bond/show $OCF_RESKEY_bond 1>/dev/null 2>&1 ovs-vsctl show|grep Bridge|grep -q $OCF_RESKEY_bridge } if_init() { local rc if [ X"$OCF_RESKEY_bond" = "X" ]; then ocf_exit_reason "Bond name (the bond parameter) is mandatory" exit $OCF_ERR_CONFIGURED fi if [ X"$OCF_RESKEY_bridge" = "X" ]; then ocf_exit_reason "Bridge name (the bridge parameter) is mandatory" exit $OCF_ERR_CONFIGURED fi BOND="$OCF_RESKEY_bond" BRIDGE="$OCF_RESKEY_bridge" if is_bond then if ! is_bridge then ocf_exit_reason "Bridge $OCF_RESKEY_bond does not exist" exit $OCF_ERR_CONFIGURED; fi else ocf_exit_reason "Bond $OCF_RESKEY_bond does not exist" exit $OCF_ERR_CONFIGURED; fi : ${OCF_RESKEY_multiplier:="1"} if ! ocf_is_decimal "$OCF_RESKEY_multiplier"; then ocf_exit_reason "Invalid OCF_RESKEY_multiplier [$OCF_RESKEY_multiplier]" exit $OCF_ERR_CONFIGURED fi ATTRNAME=${OCF_RESKEY_name:-"ovsmonitor-$BOND"} REP_COUNT=${OCF_RESKEY_repeat_count:-5} if ! ocf_is_decimal "$REP_COUNT" -o [ $REP_COUNT -lt 1 ]; then ocf_exit_reason "Invalid OCF_RESKEY_repeat_count [$REP_COUNT]" exit $OCF_ERR_CONFIGURED fi REP_INTERVAL_S=${OCF_RESKEY_repeat_interval:-10} if ! ocf_is_decimal "$REP_INTERVAL_S"; then ocf_exit_reason "Invalid OCF_RESKEY_repeat_interval [$REP_INTERVAL_S]" exit $OCF_ERR_CONFIGURED fi : ${OCF_RESKEY_pktcnt_timeout:="5"} if ! 
ocf_is_decimal "$OCF_RESKEY_pktcnt_timeout"; then ocf_exit_reason "Invalid OCF_RESKEY_pktcnt_timeout [$OCF_RESKEY_pktcnt_timeout]" exit $OCF_ERR_CONFIGURED fi return $OCF_SUCCESS } # get the link status on $BOND # asks ip about running (up) bonds, returns the number of matching bond names that are up get_link_status () { #$IP2UTIL -o link show up dev "$BOND" | grep -v 'NO-CARRIER' | grep -c "$BOND" ovs-appctl bond/show "$BOND"|awk -F: '/^slave/ {print $2}'|grep -c enabled } # returns the number of received rx packets on $BOND get_rx_packets () { ocf_log debug "bond $BOND - bridge $BRIDGE" #$IP2UTIL -o -s link show dev "$BOND" \ # | sed 's/.* RX: [^0-9]*[0-9]* *\([0-9]*\) .*/\1/' local ovs_port for ovs_port in $(ovs-appctl bond/show $BOND|awk '/^slave/ {gsub(":","");print $2}') ; do ovs-ofctl dump-ports $BRIDGE $ovs_port done \ | awk -F, 'BEGIN{total=0} /rx/ {gsub(".*pkts=","");total=total+int($1)} END{print total}' } # watch for packet counter changes for max. OCF_RESKEY_pktcnt_timeout seconds # returns immedeately with return code 0 if any packets were received # otherwise 1 is returned watch_pkt_counter () { local RX_PACKETS_NEW local RX_PACKETS_OLD RX_PACKETS_OLD="`get_rx_packets`" for n in `seq $(( $OCF_RESKEY_pktcnt_timeout * 10 ))`; do sleep 0.1 RX_PACKETS_NEW="`get_rx_packets`" ocf_log debug "RX_PACKETS_OLD: $RX_PACKETS_OLD RX_PACKETS_NEW: $RX_PACKETS_NEW" if [ "$RX_PACKETS_OLD" -ne "$RX_PACKETS_NEW" ]; then ocf_log debug "we received some packets." return 0 fi done return 1 } # # Check the bond depending on the level given as parameter: $OCF_RESKEY_check_level # # 10: watch for packet counter changes # # # 30: watch for packet counter changes in promiscios mode # # If unsuccessfull in levels 18 and above, # the tests for higher check levels are run. 
# if_check () { # always check link status first link_status="`get_link_status`" ocf_log debug "link_status: $link_status (up > 0, down = 0)" if [ $link_status -eq 0 ]; then ocf_log notice "link_status: DOWN" return $OCF_NOT_RUNNING fi # if using link_status_only, skip RX count related test if ocf_is_true "$OCF_RESKEY_link_status_only"; then return $OCF_SUCCESS fi # watch for packet counter changes ocf_log debug "watch for packet counter changes" watch_pkt_counter if [ $? -eq 0 ]; then return $OCF_SUCCESS else ocf_log debug "No packets received during packet watch timeout" fi # watch for packet counter changes in promiscios mode # ocf_log debug "watch for packet counter changes in promiscios mode" # be sure switch off promiscios mode in any case # TODO: check first, wether promisc is already on and leave it untouched. # trap "$IP2UTIL link set dev $BOND promisc off; exit" INT TERM EXIT # $IP2UTIL link set dev $BOND promisc on # watch_pkt_counter && return $OCF_SUCCESS # $IP2UTIL link set dev $BOND promisc off # trap - INT TERM EXIT # looks like it's not working (for whatever reason) return $OCF_NOT_RUNNING } ####################################################################### if_usage() { cat < /dev/null` sleep $sleep_time 2> /dev/null runs=$(($runs + 1)) fi if [ $mon_rc -eq $OCF_SUCCESS -a $runs -ne 0 ]; then ocf_log info "Monitoring of $OCF_RESOURCE_INSTANCE recovered from error" fi done ocf_log debug "Monitoring return code: $mon_rc" if [ $mon_rc -eq $OCF_SUCCESS ]; then set_cib_value 1 attr_rc=$? else ocf_log err "Monitoring of $OCF_RESOURCE_INSTANCE failed." set_cib_value 0 attr_rc=$? fi ## The resource should not fail, if the bond is down. It should fail, if the update of the CIB variable has errors. ## To react on the bond failure you must use constraints based on the CIB variable value, not on the resource itself. 
exit $attr_rc } if_stop() { attrd_updater -D -n $ATTRNAME ha_pseudo_resource $OCF_RESOURCE_INSTANCE stop } if_start() { local rc ha_pseudo_resource $OCF_RESOURCE_INSTANCE start rc=$? if [ $rc -ne $OCF_SUCCESS ]; then ocf_exit_reason "Failure to create ovsmonitor state file" return $rc fi # perform the first monitor during the start operation if_monitor return $? } if_validate() { check_binary ovs-vsctl check_binary ovs-appctl check_binary ovs-ofctl if_init } case $__OCF_ACTION in meta-data) meta_data ;; usage|help) if_usage exit $OCF_SUCCESS ;; esac if_validate case $__OCF_ACTION in start) if_start exit $? ;; stop) if_stop exit $? ;; monitor|status) if_monitor exit $? ;; validate-all) exit $? ;; *) if_usage exit $OCF_ERR_UNIMPLEMENTED ;; esac diff --git a/heartbeat/portblock b/heartbeat/portblock index 0d27891e9..c1a5759c6 100755 --- a/heartbeat/portblock +++ b/heartbeat/portblock @@ -1,566 +1,566 @@ #!/bin/sh # # portblock: iptables temporary portblocking control # # Author: Sun Jiang Dong (initial version) # Philipp Reisner (per-IP filtering) # # License: GNU General Public License (GPL) # # Copyright: (C) 2005 International Business Machines # # OCF parameters are as below: # OCF_RESKEY_protocol # OCF_RESKEY_portno # OCF_RESKEY_action # OCF_RESKEY_ip # OCF_RESKEY_tickle_dir # OCF_RESKEY_sync_script ####################################################################### # Initialization: : ${OCF_FUNCTIONS_DIR=${OCF_ROOT}/lib/heartbeat} . ${OCF_FUNCTIONS_DIR}/ocf-shellfuncs # Defaults OCF_RESKEY_ip_default="0.0.0.0/0" OCF_RESKEY_reset_local_on_unblock_stop_default="false" : ${OCF_RESKEY_ip=${OCF_RESKEY_ip_default}} : ${OCF_RESKEY_reset_local_on_unblock_stop=${OCF_RESKEY_reset_local_on_unblock_stop_default}} ####################################################################### CMD=`basename $0` TICKLETCP=$HA_BIN/tickle_tcp usage() { cat <&2 usage: $CMD {start|stop|status|monitor|meta-data|validate-all} $CMD is used to temporarily block ports using iptables. 
It can be used to blackhole a port before bringing up an IP address, and enable it after a service is started. To do that for samba, the following can be used: crm configure < 1.0 Resource script for portblock. It is used to temporarily block ports using iptables. In addition, it may allow for faster TCP reconnects for clients on failover. Use that if there are long lived TCP connections to an HA service. This feature is enabled by setting the tickle_dir parameter and only in concert with action set to unblock. Note that the tickle ACK function is new as of version 3.0.2 and hasn't yet seen widespread use. Block and unblocks access to TCP and UDP ports The protocol used to be blocked/unblocked. protocol The port number used to be blocked/unblocked. portno The action (block/unblock) to be done on the protocol::portno. action If for some reason the long lived server side TCP sessions won't be cleaned up by a reconfiguration/flush/stop of whatever services this portblock protects, they would linger in the connection table, even after the IP is gone -and services have been switched over to an other node. +and services have been switched over to another node. An example would be the default NFS kernel server. These "known" connections may seriously confuse and delay a later switchback. Enabling this option will cause this agent to try to get rid of these connections by injecting a temporary iptables rule to TCP-reset outgoing packets from the blocked ports, and additionally tickle them locally, just before it starts to DROP incoming packets on "unblock stop". (try to) reset server TCP sessions when unblock stops The IP address used to be blocked/unblocked. ip The shared or local directory (_must_ be absolute path) which stores the established TCP connections. Tickle directory If the tickle_dir is a local directory, then the TCP connection state file has to be replicated to other nodes in the cluster. It can be csync2 (default), some wrapper of rsync, or whatever. 
It takes the file name as a single argument. For csync2, set it to "csync2 -xv". Connection state file synchronization script END } # # Because this is the normal usage, we consider "block" # resources to be pseudo-resources -- that is, their status can't # be reliably determined through external means. # This is because we expect an "unblock" resource to come along # and disable us -- but we're still in some sense active... # #active_grep_pat {udp|tcp} portno,portno active_grep_pat() { w="[ ][ ]*" any="0\\.0\\.0\\.0/0" echo "^DROP${w}${1}${w}--${w}${any}${w}${3}${w}multiport${w}dports${w}${2}\>" } #chain_isactive {udp|tcp} portno,portno ip chain_isactive() { PAT=`active_grep_pat "$1" "$2" "$3"` $IPTABLES $wait -n -L INPUT | grep "$PAT" >/dev/null } save_tcp_connections() { [ -z "$OCF_RESKEY_tickle_dir" ] && return statefile=$OCF_RESKEY_tickle_dir/$OCF_RESKEY_ip if [ -z "$OCF_RESKEY_sync_script" ]; then netstat -tn |awk -F '[:[:space:]]+' ' $8 == "ESTABLISHED" && $4 == "'$OCF_RESKEY_ip'" \ {printf "%s:%s\t%s:%s\n", $4,$5, $6,$7}' | dd of="$statefile".new conv=fsync status=none && mv "$statefile".new "$statefile" else netstat -tn |awk -F '[:[:space:]]+' ' $8 == "ESTABLISHED" && $4 == "'$OCF_RESKEY_ip'" \ {printf "%s:%s\t%s:%s\n", $4,$5, $6,$7}' \ > $statefile $OCF_RESKEY_sync_script $statefile > /dev/null 2>&1 & fi } tickle_remote() { [ -z "$OCF_RESKEY_tickle_dir" ] && return echo 1 > /proc/sys/net/ipv4/tcp_tw_recycle f=$OCF_RESKEY_tickle_dir/$OCF_RESKEY_ip [ -r $f ] || return $TICKLETCP -n 3 < $f } tickle_local() { [ -z "$OCF_RESKEY_tickle_dir" ] && return f=$OCF_RESKEY_tickle_dir/$OCF_RESKEY_ip [ -r $f ] || return # swap "local" and "remote" address, # so we tickle ourselves. # We set up a REJECT with tcp-reset before we do so, so we get rid of # the no longer wanted potentially long lived "ESTABLISHED" connection # entries on the IP we are going to delet in a sec. These would get in # the way if we switch-over and then switch-back in quick succession. 
local i awk '{ print $2, $1; }' $f | $TICKLETCP netstat -tn | grep -Fw $OCF_RESKEY_ip || return for i in 0.1 0.5 1 2 4 ; do sleep $i awk '{ print $2, $1; }' $f | $TICKLETCP netstat -tn | grep -Fw $OCF_RESKEY_ip || break done } SayActive() { echo "$CMD DROP rule for INPUT chain [$*] is running (OK)" } SayConsideredActive() { echo "$CMD DROP rule for INPUT chain [$*] considered to be running (OK)" } SayInactive() { echo "$CMD DROP rule for INPUT chain [$*] is inactive" } #IptablesStatus {udp|tcp} portno,portno ip {block|unblock} IptablesStatus() { local rc rc=$OCF_ERR_GENERIC activewords="$CMD $1 $2 is running (OK)" if chain_isactive "$1" "$2" "$3"; then case $4 in block) SayActive $* rc=$OCF_SUCCESS ;; *) SayInactive $* rc=$OCF_NOT_RUNNING ;; esac else case $4 in block) if ha_pseudo_resource "${OCF_RESOURCE_INSTANCE}" status; then SayConsideredActive $* rc=$OCF_SUCCESS else SayInactive $* rc=$OCF_NOT_RUNNING fi ;; *) if ha_pseudo_resource "${OCF_RESOURCE_INSTANCE}" status; then SayActive $* #This is only run on real monitor events. save_tcp_connections rc=$OCF_SUCCESS else SayInactive $* rc=$OCF_NOT_RUNNING fi ;; esac fi return $rc } #IptablesBLOCK {udp|tcp} portno,portno ip IptablesBLOCK() { local rc=0 local try_reset=false if [ "$1/$4/$__OCF_ACTION" = tcp/unblock/stop ] && ocf_is_true $reset_local_on_unblock_stop then try_reset=true fi if chain_isactive "$1" "$2" "$3" then : OK -- chain already active else if $try_reset ; then $IPTABLES $wait -I OUTPUT -p "$1" -s "$3" -m multiport --sports "$2" -j REJECT --reject-with tcp-reset tickle_local fi $IPTABLES $wait -I INPUT -p "$1" -d "$3" -m multiport --dports "$2" -j DROP rc=$? 
if $try_reset ; then $IPTABLES $wait -D OUTPUT -p "$1" -s "$3" -m multiport --sports "$2" -j REJECT --reject-with tcp-reset fi fi return $rc } #IptablesUNBLOCK {udp|tcp} portno,portno ip IptablesUNBLOCK() { if chain_isactive "$1" "$2" "$3" then $IPTABLES $wait -D INPUT -p "$1" -d "$3" -m multiport --dports "$2" -j DROP else : Chain Not active fi return $? } #IptablesStart {udp|tcp} portno,portno ip {block|unblock} IptablesStart() { ha_pseudo_resource "${OCF_RESOURCE_INSTANCE}" start case $4 in block) IptablesBLOCK "$@";; unblock) IptablesUNBLOCK "$@" rc=$? tickle_remote #ignore run_tickle_tcp exit code! return $rc ;; *) usage; return 1; esac return $? } #IptablesStop {udp|tcp} portno,portno ip {block|unblock} IptablesStop() { ha_pseudo_resource "${OCF_RESOURCE_INSTANCE}" stop case $4 in block) IptablesUNBLOCK "$@";; unblock) save_tcp_connections IptablesBLOCK "$@" ;; *) usage; return 1;; esac return $? } # # Check if the port is valid, this function code is not decent, but works # CheckPort() { # Examples of valid port: "1080", "1", "0080" # Examples of invalid port: "1080bad", "0", "0000", "" echo $1 |egrep -qx '[0-9]+(:[0-9]+)?(,[0-9]+(:[0-9]+)?)*' } IptablesValidateAll() { check_binary $IPTABLES case $protocol in tcp|udp) ;; *) ocf_log err "Invalid protocol $protocol!" exit $OCF_ERR_CONFIGURED ;; esac if CheckPort "$portno"; then : else ocf_log err "Invalid port number $portno!" exit $OCF_ERR_CONFIGURED fi if [ -n "$OCF_RESKEY_tickle_dir" ]; then if [ x"$action" != x"unblock" ]; then ocf_log err "Tickles are only useful with action=unblock!" exit $OCF_ERR_CONFIGURED fi if [ ! -d "$OCF_RESKEY_tickle_dir" ]; then ocf_log err "The tickle dir doesn't exist!" exit $OCF_ERR_INSTALLED fi fi case $action in block|unblock) ;; *) ocf_log err "Invalid action $action!" 
exit $OCF_ERR_CONFIGURED ;; esac if ocf_is_true $reset_local_on_unblock_stop; then if [ $action != unblock ] ; then ocf_log err "reset_local_on_unblock_stop is only relevant with action=unblock" exit $OCF_ERR_CONFIGURED fi if [ -z $OCF_RESKEY_tickle_dir ] ; then ocf_log warn "reset_local_on_unblock_stop works best with tickle_dir enabled as well" fi fi return $OCF_SUCCESS } if ( [ $# -ne 1 ] ) then usage exit $OCF_ERR_ARGS fi case $1 in meta-data) meta_data exit $OCF_SUCCESS ;; usage) usage exit $OCF_SUCCESS ;; *) ;; esac if [ -z "$OCF_RESKEY_protocol" ]; then ocf_log err "Please set OCF_RESKEY_protocol" exit $OCF_ERR_CONFIGURED fi if [ -z "$OCF_RESKEY_portno" ]; then ocf_log err "Please set OCF_RESKEY_portno" exit $OCF_ERR_CONFIGURED fi if [ -z "$OCF_RESKEY_action" ]; then ocf_log err "Please set OCF_RESKEY_action" exit $OCF_ERR_CONFIGURED fi # iptables v1.4.20+ is required to use -w (wait) version=$(iptables -V | awk -F ' v' '{print $NF}') ocf_version_cmp "$version" "1.4.19.1" if [ "$?" -eq "2" ]; then wait="-w" else wait="" fi protocol=$OCF_RESKEY_protocol portno=$OCF_RESKEY_portno action=$OCF_RESKEY_action ip=$OCF_RESKEY_ip reset_local_on_unblock_stop=$OCF_RESKEY_reset_local_on_unblock_stop case $1 in start) IptablesStart $protocol $portno $ip $action ;; stop) IptablesStop $protocol $portno $ip $action ;; status|monitor) IptablesStatus $protocol $portno $ip $action ;; validate-all) IptablesValidateAll ;; *) usage exit $OCF_ERR_UNIMPLEMENTED ;; esac exit $? diff --git a/heartbeat/slapd.in b/heartbeat/slapd.in index 4366453dd..5181e8644 100644 --- a/heartbeat/slapd.in +++ b/heartbeat/slapd.in @@ -1,577 +1,577 @@ #!@BASH_SHELL@ # # Stand-alone LDAP Daemon (slapd) # # Description: Manages Stand-alone LDAP Daemon (slapd) as an OCF resource in # an high-availability setup. # # Authors: Jeroen Koekkoek # nozawat@gmail.com # John Keith Hohm # # License: GNU General Public License (GPL) # Copyright: (C) 2011 Pagelink B.V. 
# # The OCF code was inspired by the Postfix resource script written by # Raoul Bhatia . # # The code for managing the slapd instance is based on the the slapd init # script found in Debian GNU/Linux 6.0. # # OCF parameters: # OCF_RESKEY_slapd # OCF_RESKEY_ldapsearch # OCF_RESKEY_config # OCF_RESKEY_pidfile # OCF_RESKEY_user # OCF_RESKEY_group # OCF_RESKEY_services # OCF_RESKEY_watch_suffix # OCF_RESKEY_ignore_suffix # OCF_RESKEY_bind_dn # OCF_RESKEY_password # OCF_RESKEY_parameters # OCF_RESKEY_stop_escalate # OCF_RESKEY_maxfiles # ################################################################################ # Initialization: : ${OCF_FUNCTIONS_DIR=${OCF_ROOT}/lib/heartbeat} . ${OCF_FUNCTIONS_DIR}/ocf-shellfuncs : ${OCF_RESKEY_slapd="/usr/sbin/slapd"} : ${OCF_RESKEY_ldapsearch="ldapsearch"} : ${OCF_RESKEY_config=""} : ${OCF_RESKEY_pidfile=""} : ${OCF_RESKEY_user=""} : ${OCF_RESKEY_group=""} : ${OCF_RESKEY_services="ldap:///"} : ${OCF_RESKEY_watch_suffix=""} : ${OCF_RESKEY_ignore_suffix=""} : ${OCF_RESKEY_bind_dn=""} : ${OCF_RESKEY_password=""} : ${OCF_RESKEY_parameters=""} : ${OCF_RESKEY_stop_escalate=15} : ${OCF_RESKEY_maxfiles=""} USAGE="Usage: $0 {start|stop|status|monitor|validate-all|meta-data}" ORIG_IFS=$IFS NEWLINE=' ' ################################################################################ usage() { echo $USAGE >&2 } meta_data() { cat < 0.1 Resource script for Stand-alone LDAP Daemon (slapd). It manages a slapd instance as an OCF resource. Manages a Stand-alone LDAP Daemon (slapd) instance Full path to the slapd binary. For example, "/usr/sbin/slapd". Full path to slapd binary Full path to the ldapsearch binary. For example, "/usr/bin/ldapsearch". Full path to ldapsearch binary Full path to a slapd configuration directory or a slapd configuration file. For example, "/etc/ldap/slapd.d" or "/etc/ldap/slapd.conf". Full path to configuration directory or file File to read the PID from; read from olcPidFile/pidfile in config if not set. 
File to read PID from User name or id slapd will run with. The group id is also changed to this user's gid, unless the group parameter is used to override. User name or id slapd will run with Group name or id slapd will run with. Group name or id slapd will run with LDAP (and other scheme) URLs slapd will serve. For example, "ldap://127.0.0.1:389 ldaps:/// ldapi:///" LDAP (and other scheme) URLs to serve Suffix (database backend) that will be monitored for availability. Multiple -suffixes can be specified by providing a space seperated list. By providing one +suffixes can be specified by providing a space separated list. By providing one or more suffixes here, the ignore_suffix parameter is discarded. All suffixes will be monitored if left blank. Suffix that will be monitored for availability. Suffix (database backend) that will not be monitored for availability. Multiple -suffixes can be specified by providing a space seperated list. No suffix will +suffixes can be specified by providing a space separated list. No suffix will be excluded if left blank. Suffix that will not be monitored for availability. Distinguished Name used to bind to the LDAP directory for testing. Leave blank to bind to the LDAP directory anonymously. Distinguished Name used to bind to the LDAP directory for testing. Password used to bind to the LDAP directory for testing. Password used to bind to the LDAP directory for testing. slapd may be called with additional parameters. Specify any of them here. Any additional parameters to slapd. 
Number of seconds to wait for shutdown (using SIGTERM) before resorting to SIGKILL Seconds before stop escalation to KILL Maximum number of open files (for ulimit -n) Max open files END } watch_suffix() { local rc if [ -n "$OCF_RESKEY_watch_suffix" ]; then if echo "'$OCF_RESKEY_watch_suffix'" | grep "'$1'" >/dev/null 2>&1; then rc=0 else rc=1 fi else if echo "'$OCF_RESKEY_ignore_suffix'" | grep "'$1'" >/dev/null 2>&1; then rc=1 else rc=0 fi fi return $rc } slapd_pid() { local pid if [ -f "$pid_file" ]; then pid=`head -n 1 "$pid_file" 2>/dev/null` if [ "X$pid" != "X" ]; then echo "$pid" return $OCF_SUCCESS fi ocf_exit_reason "slapd pid file '$pid_file' empty." return $OCF_ERR_GENERIC fi ocf_log info "slapd pid file '$pid_file' does not exist." return $OCF_NOT_RUNNING } slapd_status() { local pid=$1 if ! kill -0 $pid >/dev/null 2>&1; then return $OCF_NOT_RUNNING else return $OCF_SUCCESS fi } slapd_start() { local options local reason local rc local state slapd_status `slapd_pid`; state=$? if [ $state -eq $OCF_SUCCESS ]; then ocf_log info "slapd already running." return $state elif [ $state -eq $OCF_ERR_GENERIC ]; then return $state fi options="-u $user -g $group" if [ -d "$config" ]; then options="$options -F $config" elif [ -f "$config" ]; then options="$options -f $config" else ocf_exit_reason "slapd configuration '$config' does not exist." return $OCF_ERR_INSTALLED fi if [ -n "$parameters" ]; then options="$options $parameters" fi if [ -n "$OCF_RESKEY_maxfiles" ]; then ulimit -n $OCF_RESKEY_maxfiles u_rc=$? if [ "$u_rc" -ne 0 ]; then ocf_log warn "Could not set ulimit for open files for slapd to '$OCF_RESKEY_maxfiles'" fi fi if [ -n "$services" ]; then $slapd -h "$services" $options 2>&1; rc=$? else $slapd $options 2>&1; rc=$? fi if [ $rc -ne 0 ]; then ocf_exit_reason "slapd returned error." return $OCF_ERR_GENERIC fi while true; do slapd_monitor start if [ $? = "$OCF_SUCCESS" ]; then break fi sleep 1 done ocf_log info "slapd started." 
return $OCF_SUCCESS } slapd_stop() { local pid local rc local state pid=`slapd_pid`; slapd_status $pid; state=$? if [ $state -eq $OCF_NOT_RUNNING ]; then ocf_log info "slapd already stopped." return $OCF_SUCCESS elif [ $state -eq $OCF_ERR_GENERIC ]; then return $state fi ocf_stop_processes TERM $OCF_RESKEY_stop_escalate $pid; rc=$? if [ $rc -eq 1 ]; then ocf_log err "cannot stop slapd." return $OCF_ERR_GENERIC fi if [ -f "$pid_file" ]; then rm -f "$pid_file" >/dev/null 2>&1 fi ocf_log info "slapd stopped." return $OCF_SUCCESS } slapd_monitor() { local options local rc local state local suffix local suffixes local err_option="-info" slapd_status `slapd_pid`; state=$? if [ $state -eq $OCF_NOT_RUNNING ]; then if [ -z "$1" ];then if ! ocf_is_probe; then ocf_exit_reason "slapd process not found." fi fi return $state elif [ $state -ne $OCF_SUCCESS ]; then ocf_exit_reason "slapd returned error." return $state fi if [ -d "$config" ]; then for suffix in `find "$config"/'cn=config' -type f -name olcDatabase* -exec \ sed -ne 's/^[[:space:]]*olcSuffix:[[:space:]]\+\(.\+\)/\1/p' {} \;` do suffix=${suffix#\"*} suffix=${suffix%\"*} if watch_suffix $suffix; then suffixes="$suffixes $suffix" fi done elif [ -f "$config" ]; then for suffix in `sed -ne 's/^[[:space:]]*suffix[[:space:]]\+\(.\+\)/\1/p' "$config"` do suffix=${suffix#\"*} suffix=${suffix%\"*} if watch_suffix $suffix; then suffixes="$suffixes $suffix" fi done else if ocf_is_probe; then ocf_log info "slapd configuration '$config' does not exist during probe." else ocf_exit_reason "slapd configuration '$config' does not exist." return $OCF_ERR_INSTALLED fi fi options="-LLL -s base -x" if [ -n "$bind_dn" ]; then options="$options -D $bind_dn -w $password" fi [ -z "$1" ] && err_option="" for suffix in $suffixes; do ocf_run -q $err_option "$ldapsearch" -H "$services" -b "$suffix" $options >/dev/null 2>&1; rc=$? 
case "$rc" in "0") ocf_log debug "slapd database with suffix '$suffix' reachable" ;; "49") ocf_exit_reason "slapd database with suffix '$suffix' unreachable. Invalid credentials." return $OCF_ERR_CONFIGURED ;; *) if [ -z "$1" ] || [ -n "$1" -a $rc -ne 1 ]; then ocf_exit_reason "slapd database with suffix '$suffix' unreachable. exit code ($rc)" fi state=$OCF_ERR_GENERIC ;; esac done return $state } slapd_validate_all() { check_binary "$slapd" check_binary "$ldapsearch" if [ -z "$pid_file" ]; then if [ -d "$config" ]; then pid_file=`sed -ne \ 's/^olcPidFile:[[:space:]]\+\(.\+\)[[:space:]]*/\1/p' \ "$config"/'cn=config.ldif' 2>/dev/null` elif [ -f "$config" ]; then pid_file=`sed -ne \ 's/^pidfile[[:space:]]\+\(.\+\)/\1/p' \ "$config" 2>/dev/null` else if ocf_is_probe; then ocf_log info "slapd configuration '$config' does not exist during probe." else ocf_exit_reason "slapd configuration '$config' does not exist." return $OCF_ERR_INSTALLED fi fi fi if [ -z "$user" ]; then user=`id -nu 2>/dev/null` elif ! id "$user" >/dev/null 2>&1; then ocf_exit_reason "slapd user '$user' does not exist" return $OCF_ERR_INSTALLED fi if [ -z "$group" ]; then group=`id -ng 2>/dev/null` elif ! grep "^$group:" /etc/group >/dev/null 2>&1; then ocf_exit_reason "slapd group '$group' does not exist" return $OCF_ERR_INSTALLED fi pid_dir=`dirname "$pid_file"` if [ ! 
-d "$pid_dir" ]; then mkdir -p "$pid_dir" chown -R "$user" "$pid_dir" chgrp -R "$group" "$pid_dir" fi return $OCF_SUCCESS } # # Main # slapd=$OCF_RESKEY_slapd ldapsearch=$OCF_RESKEY_ldapsearch config=$OCF_RESKEY_config user=$OCF_RESKEY_user group=$OCF_RESKEY_group services=$OCF_RESKEY_services bind_dn=$OCF_RESKEY_bind_dn password=$OCF_RESKEY_password parameters=$OCF_RESKEY_parameters pid_file=$OCF_RESKEY_pidfile if [ -z "$config" ]; then config_dirname="/etc/ldap" if [ -e "/etc/openldap" ]; then config_dirname="/etc/openldap" fi config="$config_dirname/slapd.conf" if [ -e "$config_dirname/slapd.d" ]; then config="$config_dirname/slapd.d" fi fi if [ $# -ne 1 ]; then usage exit $OCF_ERR_ARGS fi case $1 in meta-data) meta_data exit $OCF_SUCCESS ;; usage|help) usage exit $OCF_SUCCESS ;; esac slapd_validate_all rc=$? [ $rc -eq $OCF_SUCCESS ] || exit $rc case $1 in status) slapd_status `slapd_pid`; state=$? if [ $state -eq $OCF_SUCCESS ]; then ocf_log debug "slapd is running." elif [ $state -eq $OCF_NOT_RUNNING ]; then ocf_log debug "slapd is stopped." fi exit $state ;; start) slapd_start exit $? ;; stop) slapd_stop exit $? ;; monitor) slapd_monitor; state=$? exit $state ;; validate-all) exit $OCF_SUCCESS ;; *) usage exit $OCF_ERR_UNIMPLEMENTED ;; esac diff --git a/rgmanager/src/resources/SAPDatabase b/rgmanager/src/resources/SAPDatabase index 2e398c4fd..92009d1ad 100644 --- a/rgmanager/src/resources/SAPDatabase +++ b/rgmanager/src/resources/SAPDatabase @@ -1,1026 +1,1026 @@ #!/bin/sh # # SAPDatabase # # Description: Manages any type of SAP supported database instance # as a High-Availability OCF compliant resource. # # Author: Alexander Krauth, October 2006 # Support: linux@sap.com # License: GNU General Public License (GPL) # Copyright: (c) 2006, 2007 Alexander Krauth # # An example usage: # See usage() function below for more details... 
# # OCF instance parameters: # OCF_RESKEY_SID # OCF_RESKEY_DIR_EXECUTABLE (optional, well known directories will be searched by default) # OCF_RESKEY_DBTYPE # OCF_RESKEY_NETSERVICENAME (optional, non standard name of Oracle Listener) # OCF_RESKEY_DBJ2EE_ONLY (optional, default is false) # OCF_RESKEY_JAVA_HOME (optional, only needed if DBJ2EE_ONLY is true and JAVA_HOME enviroment variable is not set) # OCF_RESKEY_STRICT_MONITORING (optional, activate application level monitoring - with Oracle a failover will occur in case of an archiver stuck) # OCF_RESKEY_AUTOMATIC_RECOVER (optional, automatic startup recovery, default is false) # OCF_RESKEY_DIR_BOOTSTRAP (optional, if non standard J2EE server directory) # OCF_RESKEY_DIR_SECSTORE (optional, if non standard J2EE secure store directory) # OCF_RESKEY_DB_JARS (optional, if maintained in bootstrap.properties, mandatory for WebAS Java 7.10) # OCF_RESKEY_PRE_START_USEREXIT (optional, lists a script which can be executed before the resource is started) # OCF_RESKEY_POST_START_USEREXIT (optional, lists a script which can be executed after the resource is started) # OCF_RESKEY_PRE_STOP_USEREXIT (optional, lists a script which can be executed before the resource is stopped) # OCF_RESKEY_POST_STOP_USEREXIT (optional, lists a script which can be executed after the resource is stopped) # # ToDo: # Remove all the database dependend stuff from the agent and use # saphostcontrol daemon as soon as SAP will release it. # ####################################################################### # Initialization: if [ -f $(dirname $0)/.ocf-shellfuncs ]; then . $(dirname $0)/.ocf-shellfuncs elif [ -f $(dirname $0)/ocf-shellfuncs ]; then LC_ALL=C LANG=C PATH=/bin:/sbin:/usr/bin:/usr/sbin export LC_ALL LANG PATH . $(dirname $0)/ocf-shellfuncs else echo Could not find ocf-shellfuncs! 
exit 1 fi ####################################################################### SH=/bin/sh usage() { methods=`sapdatabase_methods` methods=`echo $methods | tr ' ' '|'` cat <<-! usage: $0 ($methods) $0 manages a SAP database of any type as an HA resource. Currently Oracle, MaxDB and DB/2 UDB are supported. ABAP databases as well as JAVA only databases are supported. The 'start' operation starts the instance. The 'stop' operation stops the instance. The 'status' operation reports whether the instance is running The 'monitor' operation reports whether the instance seems to be working The 'recover' operation tries to recover the instance after a crash (instance will be stopped first!) The 'validate-all' operation reports whether the parameters are valid The 'methods' operation reports on the methods $0 supports ! } meta_data() { cat < 1.92.1 Resource script for SAP databases. It manages a SAP database of any type as an HA resource. SAP database resource agent The unique SAP system identifier. e.g. P01 SAP system ID The full qualified path where to find sapstartsrv and sapcontrol. path of sapstartsrv and sapcontrol The name of the database vendor you use. Set either: ORA,DB6,ADA database vendor The Oracle TNS listener name. listener name If you do not have a ABAP stack installed in the SAP database, set this to TRUE only JAVA stack installed This is only needed if the DBJ2EE_ONLY parameter is set to true. Enter the path to the Java SDK which is used by the SAP WebAS Java Path to Java SDK This controls how the resource agent monitors the database. If set to true, it will use SAP tools to test the connect to the database. Do not use with Oracle, because it will result in unwanted failovers in case of an archiver stuck Activates application level monitoring - The SAPDatabase resource agent tries to recover a failed start attempt automaticaly one time. This is done by running a forced abort of the RDBMS and/or executing recovery commands. 
+ The SAPDatabase resource agent tries to recover a failed start attempt automatically one time. This is done by running a forced abort of the RDBMS and/or executing recovery commands. Enable or disable automatic startup recovery The full qualified path where to find the J2EE instance bootstrap directory. e.g. /usr/sap/P01/J00/j2ee/cluster/bootstrap path to j2ee bootstrap directory The full qualified path where to find the J2EE security store directory. e.g. /usr/sap/P01/SYS/global/security/lib/tools path to j2ee secure store directory The full qualified filename of the jdbc driver for the database connection test. It will be automaticaly read from the bootstrap.properties file in Java engine 6.40 and 7.00. For Java engine 7.10 the parameter is mandatory. file name of the jdbc driver The full qualified path where to find a script or program which should be executed before this resource gets started. path to a pre-start script The full qualified path where to find a script or program which should be executed after this resource got started. path to a post-start script The full qualified path where to find a script or program which should be executed before this resource gets stopped. path to a pre-start script The full qualified path where to find a script or program which should be executed after this resource got stopped. path to a post-start script END } trap_handler() { rm -f $TEMPFILE exit $OCF_ERR_GENERIC } do_exit() { # If we've got a tempfile variable and the tempfile exists... # ... if the return code is 0 *or* the temp file is empty # remove it. if [ -n "$TEMPFILE" ] && [ -e "$TEMPFILE" ]; then if [ $1 -eq 0 ] || [ "$(stat -c %s $TEMPFILE)" = "0" ]; then rm -f $TEMPFILE fi fi exit $1 } # # listener_start: Start the given listener # listener_start() { orasid="ora`echo $SID | tr '[:upper:]' '[:lower:]'`" rc=$OCF_SUCCESS output=`echo "lsnrctl start $NETSERVICENAME" | su - $orasid 2>&1` if [ $? 
-eq 0 ] then ocf_log info "Oracle Listener $NETSERVICENAME started: $output" rc=$OCF_SUCCESS else ocf_log err "Oracle Listener $NETSERVICENAME start failed: $output" rc=$OCF_ERR_GENERIC fi return $rc } # # listener_stop: Stop the given listener # listener_stop() { orasid="ora`echo $SID | tr '[:upper:]' '[:lower:]'`" rc=$OCF_SUCCESS if listener_status then : listener is running, trying to stop it later... else return $OCF_SUCCESS fi output=`echo "lsnrctl stop $NETSERVICENAME" | su - $orasid 2>&1` if [ $? -eq 0 ] then ocf_log info "Oracle Listener $NETSERVICENAME stopped: $output" else ocf_log err "Oracle Listener $NETSERVICENAME stop failed: $output" rc=$OCF_ERR_GENERIC fi return $rc } # # listener_status: is the given listener running? # listener_status() { orasid="ora`echo $SID | tr '[:upper:]' '[:lower:]'`" # Note: ps cuts off it's output at column $COLUMNS, so "ps -ef" can not be used here # as the output might be to long. cnt=`ps efo args --user $orasid | grep $NETSERVICENAME | grep -c tnslsnr` if [ $cnt -eq 1 ] then rc=$OCF_SUCCESS else ocf_log info "listener process not running for $NETSERVICENAME for $SID" rc=$OCF_ERR_GENERIC fi return $rc } # # x_server_start: Start the given x_server # x_server_start() { rc=$OCF_SUCCESS output=`echo "x_server start" | su - $sidadm 2>&1` if [ $? -eq 0 ] then ocf_log info "MaxDB x_server start: $output" rc=$OCF_SUCCESS else ocf_log err "MaxDB x_server start failed: $output" rc=$OCF_ERR_GENERIC fi return $rc } # # x_server_stop: Stop the x_server # x_server_stop() { rc=$OCF_SUCCESS output=`echo "x_server stop" | su - $sidadm 2>&1` if [ $? -eq 0 ] then ocf_log info "MaxDB x_server stop: $output" else ocf_log err "MaxDB x_server stop failed: $output" rc=$OCF_ERR_GENERIC fi return $rc } # # x_server_status: is the x_server running? 
# x_server_status() { sdbuser=`grep "^SdbOwner" /etc/opt/sdb | awk -F'=' '{print $2}'` # Note: ps cuts off it's output at column $COLUMNS, so "ps -ef" can not be used here # as the output might be to long. cnt=`ps efo args --user $sdbuser | grep -c vserver` if [ $cnt -ge 1 ] then rc=$OCF_SUCCESS else ocf_log info "x_server process not running" rc=$OCF_ERR_GENERIC fi return $rc } # # oracle_stop: Stop the Oracle database without any condition # oracle_stop() { echo '#!/bin/sh LOG=$HOME/stopdb.log date > $LOG if [ -x "${ORACLE_HOME}/bin/sqlplus" ] then SRVMGRDBA_EXE="${ORACLE_HOME}/bin/sqlplus" else echo "Can not find executable sqlplus" >> $LOG exit 1 fi $SRVMGRDBA_EXE /NOLOG >> $LOG << ! connect / as sysdba shutdown immediate exit ! rc=$? cat $LOG exit $rc' > $TEMPFILE chmod 700 $TEMPFILE chown $sidadm $TEMPFILE su - $sidadm -c $TEMPFILE retcode=$? rm -f $TEMPFILE if [ $retcode -eq 0 ]; then sapdatabase_status if [ $? -ne $OCF_NOT_RUNNING ]; then retcode=1 fi fi return $retcode } # # maxdb_stop: Stop the MaxDB database without any condition # maxdb_stop() { # x_Server must be running to stop database x_server_status if [ $? -ne $OCF_SUCCESS ]; then x_server_start; fi if [ $DBJ2EE_ONLY -eq 1 ]; then userkey=c_J2EE else userkey=c fi echo "#!/bin/sh LOG=\$HOME/stopdb.log date > \$LOG echo \"Stop database with xuserkey >$userkey<\" >> \$LOG dbmcli -U ${userkey} db_offline >> \$LOG 2>&1 exit \$?" > $TEMPFILE chmod 700 $TEMPFILE chown $sidadm $TEMPFILE su - $sidadm -c $TEMPFILE retcode=$? rm -f $TEMPFILE if [ $retcode -eq 0 ]; then sapdatabase_status if [ $? -ne $OCF_NOT_RUNNING ]; then retcode=1 fi fi return $retcode } # # db6udb_stop: Stop the DB2/UDB database without any condition # db6udb_stop() { echo '#!/bin/sh LOG=$HOME/stopdb.log date > $LOG echo "Shut down the database" >> $LOG $INSTHOME/sqllib/bin/db2 deactivate database $DB2DBDFT |tee -a $LOG 2>&1 $INSTHOME/sqllib/adm/db2stop force |tee -a $LOG 2>&1 exit $?' 
> $TEMPFILE chmod 700 $TEMPFILE chown $sidadm $TEMPFILE su - $sidadm -c $TEMPFILE retcode=$? rm -f $TEMPFILE if [ $retcode -eq 0 ]; then sapdatabase_status if [ $? -ne $OCF_NOT_RUNNING ]; then retcode=1 fi fi return $retcode } # # oracle_recover: try to clean up oracle after a crash # oracle_recover() { echo '#!/bin/sh LOG=$HOME/recover.log date > $LOG echo "Logfile written by heartbeat SAPDatabase resource agent" >> $LOG if [ -x "${ORACLE_HOME}/bin/sqlplus" ] then SRVMGRDBA_EXE="${ORACLE_HOME}/bin/sqlplus" else echo "Can not find executable sqlplus" >> $LOG exit 1 fi $SRVMGRDBA_EXE /NOLOG >> $LOG << ! connect / as sysdba shutdown abort startup mount WHENEVER SQLERROR EXIT SQL.SQLCODE WHENEVER OSERROR EXIT FAILURE alter database recover automatic database; alter database open; exit ! rc=$? cat $LOG exit $rc' > $TEMPFILE chmod 700 $TEMPFILE chown $sidadm $TEMPFILE su - $sidadm -c $TEMPFILE retcode=$? rm -f $TEMPFILE return $retcode } # # maxdb_recover: try to clean up MaxDB after a crash # maxdb_recover() { # x_Server must be running to stop database x_server_status if [ $? -ne $OCF_SUCCESS ]; then x_server_start; fi if [ $DBJ2EE_ONLY -eq 1 ]; then userkey=c_J2EE else userkey=c fi echo "#!/bin/sh LOG=\$HOME/recover.log date > \$LOG echo \"Logfile written by heartbeat SAPDatabase resource agent\" >> \$LOG echo \"Cleanup database with xuserkey >$userkey<\" >> \$LOG echo \"db_stop\" >> \$LOG 2>&1 dbmcli -U ${userkey} db_stop >> \$LOG 2>&1 echo \"db_clear\" >> \$LOG 2>&1 dbmcli -U ${userkey} db_clear >> \$LOG 2>&1 echo \"db_online\" >> \$LOG 2>&1 dbmcli -U ${userkey} db_online >> \$LOG 2>&1 rc=\$? cat \$LOG exit \$rc" > $TEMPFILE chmod 700 $TEMPFILE chown $sidadm $TEMPFILE su - $sidadm -c $TEMPFILE retcode=$? 
rm -f $TEMPFILE return $retcode } # # db6udb_recover: try to recover DB/2 after a crash # db6udb_recover() { db2sid="db2`echo $SID | tr '[:upper:]' '[:lower:]'`" echo '#!/bin/sh LOG=$HOME/recover.log date > $LOG echo "Logfile written by heartbeat SAPDatabase resource agent" >> $LOG $INSTHOME/sqllib/bin/db2_kill >> $LOG 2>&1 $INSTHOME/sqllib/adm/db2start >> $LOG 2>&1 $INSTHOME/sqllib/bin/db2 activate database $DB2DBDFT >> $LOG 2>&1 rc=$? cat $LOG exit $rc' > $TEMPFILE chmod 700 $TEMPFILE chown $db2sid $TEMPFILE su - $db2sid -c $TEMPFILE retcode=$? rm -f $TEMPFILE return $retcode } # # methods: What methods/operations do we support? # sapdatabase_methods() { cat <<-! start stop status monitor recover validate-all methods meta-data usage ! } # # sapuserexit : Many SAP customers need some additional processes/tools to run their SAP systems. # This specialties do not allow a totally generic SAP cluster resource agent. # Someone should write a resource agent for each additional process you need, if it # is required to monitor that process within the cluster manager. To enable # you to extent this resource agent without developing a new one, this user exit # was introduced. # sapuserexit() { NAME="$1" VALUE="$2" if [ -n "$VALUE" ] then if [ -x "$VALUE" ] then ocf_log info "Calling userexit ${NAME} with customer script file ${VALUE}" eval "$VALUE" > /dev/null 2>&1 ocf_log info "Exiting userexit ${NAME} with customer script file ${VALUE}, returncode: $?" else ocf_log warn "Attribute ${NAME} is set to ${VALUE}, but this file is not executable" fi fi return 0 } # # sapdatabase_start : Start the SAP database # sapdatabase_start() { sapuserexit PRE_START_USEREXIT "$OCF_RESKEY_PRE_START_USEREXIT" case $DBTYPE in ADA) x_server_start ;; ORA) listener_start ;; esac output=`su - $sidadm -c $SAPSTARTDB` rc=$? if [ $DBJ2EE_ONLY -eq 1 ] then sapdatabase_monitor 1 rc=$? 
fi if [ $rc -ne 0 -a $OCF_RESKEY_AUTOMATIC_RECOVER -eq 1 ] then ocf_log warn "SAP database $SID start failed: $output" ocf_log warn "Try to recover database $SID" output='' sapdatabase_recover rc=$? fi if [ $rc -eq 0 ] then ocf_log info "SAP database $SID started: $output" rc=$OCF_SUCCESS sapuserexit POST_START_USEREXIT "$OCF_RESKEY_POST_START_USEREXIT" else ocf_log err "SAP database $SID start failed: $output" rc=$OCF_ERR_GENERIC fi return $rc } # # sapdatabase_stop: Stop the SAP database # sapdatabase_stop() { sapuserexit PRE_STOP_USEREXIT "$OCF_RESKEY_PRE_STOP_USEREXIT" # use of the stopdb kernel script is not possible, because there are to may checks in that # script. We want to stop the database regardless of anything. #output=`su - $sidadm -c $SAPSTOPDB` case $DBTYPE in ORA) output=`oracle_stop` ;; ADA) output=`maxdb_stop` ;; DB6) output=`db6udb_stop` ;; esac if [ $? -eq 0 ] then ocf_log info "SAP database $SID stopped: $output" rc=$OCF_SUCCESS else ocf_log err "SAP database $SID stop failed: $output" rc=$OCF_ERR_GENERIC fi case $DBTYPE in ORA) listener_stop ;; ADA) x_server_stop ;; esac sapuserexit POST_STOP_USEREXIT "$OCF_RESKEY_POST_STOP_USEREXIT" return $rc } # # sapdatabase_monitor: Can the given database instance do anything useful? # sapdatabase_monitor() { strict=$1 rc=$OCF_SUCCESS case $DBTYPE in ADA) x_server_status if [ $? -ne $OCF_SUCCESS ]; then x_server_start; fi ;; ORA) listener_status if [ $? -ne $OCF_SUCCESS ]; then listener_start; fi ;; esac if [ $strict -eq 0 ] then sapdatabase_status rc=$? else if [ $DBJ2EE_ONLY -eq 0 ] then output=`echo "$SAPDBCONNECT -d -w /dev/null" | su $sidadm 2>&1` if [ $? 
-le 4 ] then rc=$OCF_SUCCESS else rc=$OCF_NOT_RUNNING fi else MYCP="" EXECMD="" # WebAS Java 6.40+7.00 IAIK_JCE="$SECSTORE"/iaik_jce.jar IAIK_JCE_EXPORT="$SECSTORE"/iaik_jce_export.jar EXCEPTION="$BOOTSTRAP"/exception.jar LOGGING="$BOOTSTRAP"/logging.jar OPENSQLSTA="$BOOTSTRAP"/opensqlsta.jar TC_SEC_SECSTOREFS="$BOOTSTRAP"/tc_sec_secstorefs.jar JDDI="$BOOTSTRAP"/../server0/bin/ext/jdbdictionary/jddi.jar ANTLR="$BOOTSTRAP"/../server0/bin/ext/antlr/antlr.jar FRAME="$BOOTSTRAP"/../server0/bin/system/frame.jar # only start jdbcconnect when all jars available if [ -f "$EXCEPTION" -a -f "$LOGGING" -a -f "$OPENSQLSTA" -a -f "$TC_SEC_SECSTOREFS" -a -f "$JDDI" -a -f "$ANTLR" -a -f "$FRAME" -a -f "$SAPDBCONNECT" ] then MYCP=".:$FRAME:$ANTLR:$JDDI:$IAIK_JCE_EXPORT:$IAIK_JCE:$EXCEPTION:$LOGGING:$OPENSQLSTA:$TC_SEC_SECSTOREFS:$DB_JARS:$SAPDBCONNECT" EXECMD="com.sap.inst.jdbc.connect.JdbcCon -sec $SID:$SID" else # WebAS Java 7.10 LAUNCHER=${BOOTSTRAP}/sap.com~tc~bl~offline_launcher~impl.jar if [ -f "$DB_JARS" -a -f "$SAPDBCONNECT" -a -f "$LAUNCHER" ] then MYCP="$LAUNCHER" EXECMD="com.sap.engine.offline.OfflineToolStart com.sap.inst.jdbc.connect.JdbcCon ${SAPDBCONNECT}:${SECSTORE}:${DB_JARS}:${BOOTSTRAP} -sec $SID:$SID" fi fi if [ -n "$EXECMD" ] then output=`eval ${JAVA_HOME}/bin/java -cp $MYCP $EXECMD` if [ $? -le 0 ] then rc=$OCF_SUCCESS else rc=$OCF_NOT_RUNNING fi else output="Cannot find all jar files needed for database monitoring." rc=$OCF_ERR_GENERIC fi fi fi if [ $rc -ne $OCF_SUCCESS ] then ocf_log err "The SAP database $SID is not running: $output" fi return $rc } # # sapdatabase_status: Are there any database processes on this host ? 
# sapdatabase_status() { case $DBTYPE in ADA) SEARCH="$SID/db/pgm/kernel" SUSER=`grep "^SdbOwner" /etc/opt/sdb | awk -F'=' '{print $2}'` SNUM=2 ;; ORA) SEARCH="ora_[a-z][a-z][a-z][a-z]_" SUSER="ora`echo $SID | tr '[:upper:]' '[:lower:]'`" SNUM=4 ;; DB6) SEARCH="db2[a-z][a-z][a-z][a-z][a-z]" SUSER="db2`echo $SID | tr '[:upper:]' '[:lower:]'`" SNUM=5 ;; esac # Note: ps cuts off it's output at column $COLUMNS, so "ps -ef" can not be used here # as the output might be to long. cnt=`ps efo args --user $SUSER 2> /dev/null | grep -c "$SEARCH"` if [ $cnt -ge $SNUM ] then rc=$OCF_SUCCESS else # ocf_log info "Database Instance $SID is not running on `hostname`" rc=$OCF_NOT_RUNNING fi return $rc } # # sapdatabase_recover: # sapdatabase_recover() { case $DBTYPE in ORA) recoutput=`oracle_recover` ;; ADA) recoutput=`maxdb_recover` ;; DB6) recoutput=`db6udb_recover` ;; esac sapdatabase_monitor 1 retcode=$? if [ $retcode -eq $OCF_SUCCESS ] then ocf_log info "Recover of SAP database $SID was successful: $recoutput" else ocf_log err "Recover of SAP database $SID failed: $recoutput" fi return $retcode } # # sapdatabase_validate: Check the symantic of the input parameters # sapdatabase_validate() { rc=$OCF_SUCCESS if [ `echo "$SID" | grep -c '^[A-Z][A-Z0-9][A-Z0-9]$'` -ne 1 ] then ocf_log err "Parsing parameter SID: '$SID' is not a valid system ID!" rc=$OCF_ERR_ARGS fi case "$DBTYPE" in ORA|ADA|DB6) ;; *) ocf_log err "Parsing parameter DBTYPE: '$DBTYPE' is not a supported database type!" rc=$OCF_ERR_ARGS ;; esac return $rc } # # 'main' starts here... 
# if ( [ $# -ne 1 ] ) then usage exit $OCF_ERR_ARGS fi # These operations don't require OCF instance parameters to be set case "$1" in meta-data) meta_data exit $OCF_SUCCESS;; usage) usage exit $OCF_SUCCESS;; methods) sapdatabase_methods exit $?;; *);; esac # Set a tempfile and make sure to clean it up again TEMPFILE="$(mktemp /tmp/SAPDatabase.tmp.XXXXXX)" trap trap_handler INT TERM # Everything after here must call do_exit to remove temp file US=`id -u -n` US=`echo $US` if [ $US != root ] then ocf_log err "$0 must be run as root" do_exit $OCF_ERR_PERM fi # mandatory parameter check if [ -z "$OCF_RESKEY_SID" ]; then ocf_log err "Please set OCF_RESKEY_SID to the SAP system id!" do_exit $OCF_ERR_ARGS fi SID=`echo "$OCF_RESKEY_SID"` if [ -z "$OCF_RESKEY_DBTYPE" ]; then ocf_log err "Please set OCF_RESKEY_DBTYPE to the database vendor specific tag (ORA,ADA,DB6)!" do_exit $OCF_ERR_ARGS fi DBTYPE=`echo "$OCF_RESKEY_DBTYPE" | tr '[a-z]' '[A-Z]'` # optional OCF parameters, we try to guess which directories are correct EXESTARTDB="startdb" EXESTOPDB="stopdb" EXEDBCONNECT="R3trans" if [ -z "$OCF_RESKEY_DBJ2EE_ONLY" ]; then DBJ2EE_ONLY=0 else case "$OCF_RESKEY_DBJ2EE_ONLY" in 1|true|TRUE|yes|YES) DBJ2EE_ONLY=1 EXESTARTDB="startj2eedb" EXESTOPDB="stopj2eedb" EXEDBCONNECT="jdbcconnect.jar" ;; 0|false|FALSE|no|NO) DBJ2EE_ONLY=0;; *) ocf_log err "Parsing parameter DBJ2EE_ONLY: '$DBJ2EE_ONLY' is not a boolean value!" do_exit $OCF_ERR_ARGS ;; esac fi if [ -z "$OCF_RESKEY_NETSERVICENAME" ]; then case "$DBTYPE" in ORA|ora) NETSERVICENAME="LISTENER";; *) NETSERVICENAME="";; esac else NETSERVICENAME="$OCF_RESKEY_NETSERVICENAME" fi if [ -z "$OCF_RESKEY_STRICT_MONITORING" ]; then OCF_RESKEY_STRICT_MONITORING=0 else case "$OCF_RESKEY_STRICT_MONITORING" in 1|true|TRUE|yes|YES) OCF_RESKEY_STRICT_MONITORING=1;; 0|false|FALSE|no|NO) OCF_RESKEY_STRICT_MONITORING=0;; *) ocf_log err "Parsing parameter STRICT_MONITORING: '$OCF_RESKEY_STRICT_MONITORING' is not a boolean value!" 
do_exit $OCF_ERR_ARGS ;; esac fi PATHLIST=" $OCF_RESKEY_DIR_EXECUTABLE /usr/sap/$SID/*/exe /usr/sap/$SID/SYS/exe/run /sapmnt/$SID/exe " DIR_EXECUTABLE="" for EXEPATH in $PATHLIST do if [ -x $EXEPATH/$EXESTARTDB -a -x $EXEPATH/$EXESTOPDB -a -x $EXEPATH/$EXEDBCONNECT ] then DIR_EXECUTABLE=$EXEPATH SAPSTARTDB=$EXEPATH/$EXESTARTDB SAPSTOPDB=$EXEPATH/$EXESTOPDB SAPDBCONNECT=$EXEPATH/$EXEDBCONNECT break fi done if [ -z "$DIR_EXECUTABLE" ] then ocf_log warn "Cannot find $EXESTARTDB,$EXESTOPDB and $EXEDBCONNECT executable, please set DIR_EXECUTABLE parameter!" do_exit $OCF_NOT_RUNNING fi if [ $DBJ2EE_ONLY -eq 1 ] then if [ -n "$OCF_RESKEY_DIR_BOOTSTRAP" ] then BOOTSTRAP="$OCF_RESKEY_DIR_BOOTSTRAP" else BOOTSTRAP=`echo /usr/sap/$SID/*/j2ee/cluster/bootstrap | head -1` fi if [ -n "$OCF_RESKEY_DIR_SECSTORE" ] then SECSTORE="$OCF_RESKEY_DIR_SECSTORE" else SECSTORE=/usr/sap/$SID/SYS/global/security/lib/tools fi if [ -n "$OCF_RESKEY_JAVA_HOME" ] then JAVA_HOME="$OCF_RESKEY_JAVA_HOME" PATH=$JAVA_HOME/bin:$PATH else if [ -n "$JAVA_HOME" ] then PATH=$JAVA_HOME/bin:$PATH else ocf_log err "Cannot find JAVA_HOME directory, please set JAVA_HOME parameter!" 
do_exit $OCF_NOT_RUNNING fi fi if [ -n "$OCF_RESKEY_DB_JARS" ] then DB_JARS=$OCF_RESKEY_DB_JARS else if [ -f "$BOOTSTRAP"/bootstrap.properties ]; then DB_JARS=`cat $BOOTSTRAP/bootstrap.properties | grep -i rdbms.driverLocation | sed -e 's/\\\:/:/g' | awk -F= '{print $2}'` fi fi fi if [ -z "$OCF_RESKEY_AUTOMATIC_RECOVER" ] then OCF_RESKEY_AUTOMATIC_RECOVER=0 else case "$OCF_RESKEY_AUTOMATIC_RECOVER" in 1|true|TRUE|yes|YES) OCF_RESKEY_AUTOMATIC_RECOVER=1;; 0|false|FALSE|no|NO) OCF_RESKEY_AUTOMATIC_RECOVER=0;; esac fi # as root user we need the library path to the SAP kernel to be able to call executables if [ `echo $LD_LIBRARY_PATH | grep -c "^$DIR_EXECUTABLE\>"` -eq 0 ]; then LD_LIBRARY_PATH=$DIR_EXECUTABLE${LD_LIBRARY_PATH:+:}$LD_LIBRARY_PATH export LD_LIBRARY_PATH fi sidadm="`echo $SID | tr '[:upper:]' '[:lower:]'`adm" # What kind of method was invoked? case "$1" in start) sapdatabase_start do_exit $?;; stop) sapdatabase_stop do_exit $?;; monitor) sapdatabase_monitor $OCF_RESKEY_STRICT_MONITORING do_exit $?;; status) sapdatabase_status do_exit $?;; recover) sapdatabase_recover do_exit $?;; validate-all) sapdatabase_validate do_exit $?;; *) sapdatabase_methods do_exit $OCF_ERR_UNIMPLEMENTED;; esac diff --git a/rgmanager/src/resources/SAPInstance b/rgmanager/src/resources/SAPInstance index 699796e81..c64e021ba 100644 --- a/rgmanager/src/resources/SAPInstance +++ b/rgmanager/src/resources/SAPInstance @@ -1,630 +1,630 @@ #!/bin/sh # # SAPInstance # # Description: Manages a single SAP Instance as a High-Availability # resource. One SAP Instance is defined by one # SAP Instance-Profile. start/stop handels all services # of the START-Profile, status and monitor care only # about essential services. # # Author: Alexander Krauth, June 2006 # Support: linux@sap.com # License: GNU General Public License (GPL) # Copyright: (c) 2006, 2007 Alexander Krauth # # An example usage: # See usage() function below for more details... 
# # OCF instance parameters: # OCF_RESKEY_InstanceName # OCF_RESKEY_DIR_EXECUTABLE (optional, well known directories will be searched by default) # OCF_RESKEY_DIR_PROFILE (optional, well known directories will be searched by default) # OCF_RESKEY_START_PROFILE (optional, well known directories will be searched by default) # OCF_RESKEY_START_WAITTIME (optional, to solve timing problems during J2EE-Addin start) # OCF_RESKEY_AUTOMATIC_RECOVER (optional, automatic startup recovery using cleanipc, default is false) # OCF_RESKEY_PRE_START_USEREXIT (optional, lists a script which can be executed before the resource is started) # OCF_RESKEY_POST_START_USEREXIT (optional, lists a script which can be executed after the resource is started) # OCF_RESKEY_PRE_STOP_USEREXIT (optional, lists a script which can be executed before the resource is stopped) # OCF_RESKEY_POST_STOP_USEREXIT (optional, lists a script which can be executed after the resource is stopped) # ####################################################################### # Initialization: if [ -f $(dirname $0)/.ocf-shellfuncs ]; then . $(dirname $0)/.ocf-shellfuncs elif [ -f $(dirname $0)/ocf-shellfuncs ]; then LC_ALL=C LANG=C PATH=/bin:/sbin:/usr/bin:/usr/sbin export LC_ALL LANG PATH . $(dirname $0)/ocf-shellfuncs else echo Could not find ocf-shellfuncs! exit 1 fi ####################################################################### SH=/bin/sh usage() { methods=`sapinstance_methods` methods=`echo $methods | tr ' ' '|'` cat <<-! usage: $0 ($methods) $0 manages a SAP Instance as an HA resource. The 'start' operation starts the instance. The 'stop' operation stops the instance. The 'status' operation reports whether the instance is running The 'monitor' operation reports whether the instance seems to be working The 'validate-all' operation reports whether the parameters are valid The 'methods' operation reports on the methods $0 supports ! } meta_data() { cat < 1.92.1 Resource script for SAP. 
It manages a SAP Instance as an HA resource. SAP instance resource agent The full qualified SAP instance name. e.g. P01_DVEBMGS00_sapp01ci instance name: SID_INSTANCE_VIR-HOSTNAME The full qualified path where to find sapstartsrv and sapcontrol. path of sapstartsrv and sapcontrol The full qualified path where to find the SAP START profile. path of start profile The name of the SAP START profile. start profile name After that time in seconds a monitor operation is executed by the resource agent. Does the monitor return SUCCESS, the start is handled as SUCCESS. This is useful to resolve timing problems with e.g. the J2EE-Addin instance. Check the successful start after that time (do not wait for J2EE-Addin) - The SAPInstance resource agent tries to recover a failed start attempt automaticaly one time. This is done by killing runing instance processes and executing cleanipc. + The SAPInstance resource agent tries to recover a failed start attempt automatically one time. This is done by killing running instance processes and executing cleanipc. Enable or disable automatic startup recovery The full qualified path where to find a script or program which should be executed before this resource gets started. path to a pre-start script The full qualified path where to find a script or program which should be executed after this resource got started. path to a post-start script The full qualified path where to find a script or program which should be executed before this resource gets stopped. path to a pre-start script The full qualified path where to find a script or program which should be executed after this resource got stopped. path to a post-start script END } # # methods: What methods/operations do we support? # sapinstance_methods() { cat <<-! start stop status monitor validate-all methods meta-data usage ! 
} # # setup_limits: If sapstartsrv needs to be started by this resource agent we need to ensure that any resource # limits configured in /usr/sap/sapservices are applied. # Since sapstartsrv is started as root and then it downgrades its privileges by calling setuid() and # setgid() any PAM limits at /etc/security/limits.conf are not applied. # Should sapstartsrv need to be started, we look for values configured at /usr/sap/sapservices (as # per SAP note 1437105) and, if found, we apply them before starting sapstartsrv. # Instance processes are started by sapstartsrv and will inherit resource limits from it. # setup_limits() { if [ -r $SAPSERVICES ] then descriptors=`grep "^limit.descriptors" $SAPSERVICES | sed -e "s/limit.descriptors=//" ` if [ -n $descriptors ] then ocf_log info "found valid open file descriptors limit at ${SAPSERVICES}: ${descriptors}, applying..." eval ulimit -n $descriptors fi stacksize=`grep "^limit.stacksize" $SAPSERVICES | sed -e "s/limit.stacksize=//" ` if [ -n $stacksize ] then ocf_log info "found valid stack size limit at ${SAPSERVICES}: ${stacksize}, applying..." eval ulimit -s $stacksize fi datasize=`grep "^limit.datasize" $SAPSERVICES | sed -e "s/limit.datasize=//" ` if [ -n $datasize ] then ocf_log info "found valid process data segment size limit at ${SAPSERVICES}: ${datasize}, applying..." eval ulimit -d $datasize fi fi } # # check_sapstartsrv : Before using sapcontrol we make sure that the sapstartsrv is running for the correct instance. # We cannot use sapinit and the /usr/sap/sapservices file in case of an enquerep instance, # because then we have two instances with the same instance number. # check_sapstartsrv() { restart=0 runninginst="" chkrc=$OCF_SUCCESS output=`$SAPCONTROL -nr $InstanceNr -function ParameterValue INSTANCE_NAME -format script` if [ $? 
-eq 0 ] then runninginst=`echo "$output" | grep '^0 : ' | cut -d' ' -f3` if [ "$runninginst" != "$InstanceName" ] then ocf_log warn "sapstartsrv is running for instance $runninginst, that service will be killed" restart=1 fi else ocf_log warn "sapstartsrv is not running for instance $SID-$InstanceName, it will be started now" restart=1 fi if [ -z "$runninginst" ]; then runninginst=$InstanceName; fi if [ $restart -eq 1 ] then pkill -9 -f "sapstartsrv.*$runninginst" setup_limits $SAPSTARTSRV pf=$SAPSTARTPROFILE -D -u $sidadm # now make sure the daemon has been started and is able to respond srvrc=1 while [ $srvrc -eq 1 -a `pgrep -f "sapstartsrv.*$runninginst" | wc -l` -gt 0 ] do sleep 1 $SAPCONTROL -nr $InstanceNr -function GetProcessList > /dev/null 2>&1 srvrc=$? done if [ $srvrc -ne 1 ] then ocf_log info "sapstartsrv for instance $SID-$InstanceName was restarted !" chkrc=$OCF_SUCCESS else ocf_log error "sapstartsrv for instance $SID-$InstanceName could not be started!" chkrc=$OCF_NOT_RUNNING fi fi return $chkrc } # # sapuserexit : Many SAP customers need some additional processes/tools to run their SAP systems. # This specialties do not allow a totally generic SAP cluster resource agent. # Someone should write a resource agent for each additional process you need, if it # is required to monitor that process within the cluster manager. To enable # you to extent this resource agent without developing a new one, this user exit # was introduced. # sapuserexit() { NAME="$1" VALUE="$2" if [ -n "$VALUE" ] then if [ -x "$VALUE" ] then ocf_log info "Calling userexit ${NAME} with customer script file ${VALUE}" eval "$VALUE" > /dev/null 2>&1 ocf_log info "Exiting userexit ${NAME} with customer script file ${VALUE}, returncode: $?" 
else ocf_log warn "Attribute ${NAME} is set to ${VALUE}, but this file is not executable" fi fi return 0 } # # cleanup_instance : remove resources (processes and shared memory) from a crashed instance) # cleanup_instance() { pkill -9 -f -U $sidadm $InstanceName $DIR_EXECUTABLE/cleanipc $InstanceNr remove return 0 } # # sapinstance_start : Start the SAP instance # sapinstance_start() { sapuserexit PRE_START_USEREXIT "$OCF_RESKEY_PRE_START_USEREXIT" rc=$OCF_NOT_RUNNING loopcount=0 while [ $loopcount -lt 2 ] do loopcount=$(($loopcount + 1)) check_sapstartsrv output=`$SAPCONTROL -nr $InstanceNr -function Start` rc=$? ocf_log info "Starting SAP Instance $SID-$InstanceName: $output" if [ $rc -ne 0 ] then ocf_log err "SAP Instance $SID-$InstanceName start failed." return $OCF_ERR_GENERIC fi startrc=1 while [ $startrc -gt 0 ] do waittime_start=`date +%s` output=`$SAPCONTROL -nr $InstanceNr -function WaitforStarted $OCF_RESKEY_START_WAITTIME 10` startrc=$? waittime_stop=`date +%s` if [ $startrc -ne 0 ] then if [ $(($waittime_stop - $waittime_start)) -ge $OCF_RESKEY_START_WAITTIME ] then sapinstance_monitor NOLOG if [ $? -eq $OCF_SUCCESS ] then output="START_WAITTIME ($OCF_RESKEY_START_WAITTIME) has elapsed, but instance monitor returned SUCCESS. Instance considered running." 
startrc=0; loopcount=2 fi else if [ $loopcount -eq 1 -a $OCF_RESKEY_AUTOMATIC_RECOVER -eq 1 ] then ocf_log warn "SAP Instance $SID-$InstanceName start failed: $output" ocf_log warn "Try to recover $SID-$InstanceName" cleanup_instance else loopcount=2 fi startrc=-1 fi else loopcount=2 fi done done if [ $startrc -eq 0 ] then ocf_log info "SAP Instance $SID-$InstanceName started: $output" rc=$OCF_SUCCESS sapuserexit POST_START_USEREXIT "$OCF_RESKEY_POST_START_USEREXIT" else ocf_log err "SAP Instance $SID-$InstanceName start failed: $output" rc=$OCF_NOT_RUNNING fi return $rc } # # sapinstance_recover: Try startup of failed instance by cleaning up resources # sapinstance_recover() { cleanup_instance sapinstance_start return $? } # # sapinstance_stop: Stop the SAP instance # sapinstance_stop() { sapuserexit PRE_STOP_USEREXIT "$OCF_RESKEY_PRE_STOP_USEREXIT" check_sapstartsrv output=`$SAPCONTROL -nr $InstanceNr -function Stop` if [ $? -eq 0 ] then output=`$SAPCONTROL -nr $InstanceNr -function WaitforStopped 3600 1` if [ $? -eq 0 ] then ocf_log info "SAP Instance $SID-$InstanceName stopped: $output" rc=$OCF_SUCCESS else ocf_log err "SAP Instance $SID-$InstanceName stop failed: $output" rc=$OCF_ERR_GENERIC fi else ocf_log err "SAP Instance $SID-$InstanceName stop failed: $output" rc=$OCF_ERR_GENERIC fi sapuserexit POST_STOP_USEREXIT "$OCF_RESKEY_POST_STOP_USEREXIT" return $rc } # # sapinstance_monitor: Can the given SAP instance do anything useful? # sapinstance_monitor() { MONLOG=$1 check_sapstartsrv rc=$? 
if [ $rc -eq $OCF_SUCCESS ] then count=0 LOCALHOST=`hostname` output=`$SAPCONTROL -nr $InstanceNr -host $LOCALHOST -function GetProcessList -format script` # we have to parse the output, because the returncode doesn't tell anything about the instance status for SERVNO in `echo "$output" | grep '^[0-9] ' | cut -d' ' -f1 | sort -u` do COLOR=`echo "$output" | grep "^$SERVNO dispstatus: " | cut -d' ' -f3` SERVICE=`echo "$output" | grep "^$SERVNO name: " | cut -d' ' -f3` STATE=0 case $COLOR in GREEN|YELLOW) STATE=$OCF_SUCCESS;; *) STATE=$OCF_NOT_RUNNING;; esac case $SERVICE in disp+work|msg_server|enserver|enrepserver|jcontrol|jstart) if [ $STATE -eq $OCF_NOT_RUNNING ] then if [ "$MONLOG" != "NOLOG" ] then ocf_log err "SAP instance service $SERVICE is not running with status $COLOR !" fi rc=$STATE fi count=1;; *);; esac done if [ $count -eq 0 -a $rc -eq $OCF_SUCCESS ] then if [ "$MONLOG" != "NOLOG" ] then ocf_log err "The SAP instance does not run any services which this RA could monitor!" fi rc=$OCF_ERR_ARGS fi fi return $rc } # # sapinstance_validate: Check the symantic of the input parameters # sapinstance_validate() { rc=$OCF_SUCCESS if [ `echo "$SID" | grep -c '^[A-Z][A-Z0-9][A-Z0-9]$'` -ne 1 ] then ocf_log err "Parsing instance profile name: '$SID' is not a valid system ID!" rc=$OCF_ERR_ARGS fi if [ `echo "$InstanceName" | grep -c '^[A-Z].*[0-9][0-9]$'` -ne 1 ] then ocf_log err "Parsing instance profile name: '$InstanceName' is not a valid instance name!" rc=$OCF_ERR_ARGS fi if [ `echo "$InstanceNr" | grep -c '^[0-9][0-9]$'` -ne 1 ] then ocf_log err "Parsing instance profile name: '$InstanceNr' is not a valid instance number!" rc=$OCF_ERR_ARGS fi if [ `echo "$SAPVIRHOST" | grep -c '^[A-Za-z][A-Za-z0-9_-]*$'` -ne 1 ] then ocf_log err "Parsing instance profile name: '$SAPVIRHOST' is not a valid hostname!" rc=$OCF_ERR_ARGS fi return $rc } # # 'main' starts here... 
# if ( [ $# -ne 1 ] ) then usage exit $OCF_ERR_ARGS fi # These operations don't require OCF instance parameters to be set case "$1" in meta-data) meta_data exit $OCF_SUCCESS;; usage) usage exit $OCF_SUCCESS;; methods) sapinstance_methods exit $?;; *);; esac US=`id -u -n` US=`echo $US` if [ $US != root ] then ocf_log err "$0 must be run as root" exit $OCF_ERR_PERM fi # parameter check if [ -z "$OCF_RESKEY_InstanceName" ] then ocf_log err "Please set OCF_RESKEY_InstanceName to the name to the SAP instance profile!" exit $OCF_ERR_ARGS fi SID=`echo "$OCF_RESKEY_InstanceName" | cut -d_ -f1` InstanceName=`echo "$OCF_RESKEY_InstanceName" | cut -d_ -f2` InstanceNr=`echo "$InstanceName" | sed 's/.*\([0-9][0-9]\)$/\1/'` SAPVIRHOST=`echo "$OCF_RESKEY_InstanceName" | cut -d_ -f3` # optional OCF parameters, we try to guess which directories are correct if [ -z "$OCF_RESKEY_DIR_EXECUTABLE" ] then if [ -x /usr/sap/$SID/$InstanceName/exe/sapstartsrv -a -x /usr/sap/$SID/$InstanceName/exe/sapcontrol ] then DIR_EXECUTABLE="/usr/sap/$SID/$InstanceName/exe" SAPSTARTSRV="/usr/sap/$SID/$InstanceName/exe/sapstartsrv" SAPCONTROL="/usr/sap/$SID/$InstanceName/exe/sapcontrol" elif [ -x /usr/sap/$SID/SYS/exe/run/sapstartsrv -a -x /usr/sap/$SID/SYS/exe/run/sapcontrol ] then DIR_EXECUTABLE="/usr/sap/$SID/SYS/exe/run" SAPSTARTSRV="/usr/sap/$SID/SYS/exe/run/sapstartsrv" SAPCONTROL="/usr/sap/$SID/SYS/exe/run/sapcontrol" else ocf_log warn "Cannot find sapstartsrv and sapcontrol executable, please set DIR_EXECUTABLE parameter!" exit $OCF_NOT_RUNNING fi else DIR_EXECUTABLE="$OCF_RESKEY_DIR_EXECUTABLE" SAPSTARTSRV="$OCF_RESKEY_DIR_EXECUTABLE/sapstartsrv" SAPCONTROL="$OCF_RESKEY_DIR_EXECUTABLE/sapcontrol" fi if [ -z "$OCF_RESKEY_DIR_PROFILE" ] then if [ -d /usr/sap/$SID/SYS/profile/ ] then DIR_PROFILE="/usr/sap/$SID/SYS/profile" else ocf_log warn "Expected /usr/sap/$SID/SYS/profile/ to be a directory, please set DIR_PROFILE parameter!" 
exit $OCF_NOT_RUNNING fi else DIR_PROFILE="$OCF_RESKEY_DIR_PROFILE" fi if [ -z "$OCF_RESKEY_START_PROFILE" ] then SAPSTARTPROFILE="$DIR_PROFILE/START_${InstanceName}_${SAPVIRHOST}" if [ ! -r $SAPSTARTPROFILE ] then ocf_log warn "Expected $SAPSTARTPROFILE to be the instance START profile, please set START_PROFILE parameter!" exit $OCF_NOT_RUNNING fi else SAPSTARTPROFILE="$OCF_RESKEY_START_PROFILE" fi if [ -z "$OCF_RESKEY_START_WAITTIME" ] then OCF_RESKEY_START_WAITTIME=3600 fi if [ -z "$OCF_RESKEY_AUTOMATIC_RECOVER" ] then OCF_RESKEY_AUTOMATIC_RECOVER=0 else case "$OCF_RESKEY_AUTOMATIC_RECOVER" in 1|true|TRUE|yes|YES) OCF_RESKEY_AUTOMATIC_RECOVER=1;; 0|false|FALSE|no|NO) OCF_RESKEY_AUTOMATIC_RECOVER=0;; esac fi # as root user we need the library path to the SAP kernel to be able to call sapcontrol if [ `echo $LD_LIBRARY_PATH | grep -c "^$DIR_EXECUTABLE\>"` -eq 0 ]; then LD_LIBRARY_PATH=$DIR_EXECUTABLE${LD_LIBRARY_PATH:+:}$LD_LIBRARY_PATH export LD_LIBRARY_PATH fi sidadm="`echo $SID | tr '[:upper:]' '[:lower:]'`adm" SAPSERVICES=/usr/sap/sapservices # What kind of method was invoked? case "$1" in start) sapinstance_start exit $?;; stop) sapinstance_stop exit $?;; status|monitor) sapinstance_monitor exit $?;; validate-all) sapinstance_validate exit $?;; *) sapinstance_methods exit $OCF_ERR_UNIMPLEMENTED;; esac diff --git a/rgmanager/src/resources/db2.sh.in b/rgmanager/src/resources/db2.sh.in index cfedf1b2e..66125aa44 100644 --- a/rgmanager/src/resources/db2.sh.in +++ b/rgmanager/src/resources/db2.sh.in @@ -1,133 +1,133 @@ #!@BASH_SHELL@ # # Copyright (c) 2011 Holger Teutsch # Copyright (c) 2014 David Vossel # # This program is free software; you can redistribute it and/or # modify it under the terms of the GNU General Public License # as published by the Free Software Foundation; either version 2 # of the License, or (at your option) any later version. 
# Run the heartbeat db2 resource agent with the action given in $1.
#
# The function's exit status is the exit status of the heartbeat agent.
# If the agent is missing or not executable the whole script exits with
# OCF_ERR_INSTALLED (5 when ocf-shellfuncs did not define it).
heartbeat_db2_wrapper()
{
	# default heartbeat agent ocf root.
	export OCF_ROOT=/usr/lib/ocf
	heartbeat_db2="${OCF_ROOT}/resource.d/heartbeat/db2"

	# -x instead of the deprecated unary -a: the agent must not only exist,
	# it must be runnable, otherwise the call below fails with a less
	# meaningful exit code.
	if ! [ -x "$heartbeat_db2" ]; then
		# Diagnostics go to stderr so they never mix with agent output.
		echo "heartbeat db2 agent not found or not executable at '${heartbeat_db2}'" >&2
		exit "${OCF_ERR_INSTALLED:-5}"
	fi

	# Quoted so that neither the path nor the action is word-split.
	"$heartbeat_db2" "$1"
}
;; status|monitor) heartbeat_db2_wrapper "monitor" exit $? ;; restart) heartbeat_db2_wrapper "stop" rc=$? if [ $rc -ne 0 ]; then exit $rc fi heartbeat_db2_wrapper "start" exit $? ;; *) echo "Usage: db2.sh {start|stop|monitor|validate-all|meta-data}" exit $OCF_ERR_UNIMPLEMENTED ;; esac diff --git a/tools/ocft/db2 b/tools/ocft/db2 index 7013a992f..5c4d6ea28 100644 --- a/tools/ocft/db2 +++ b/tools/ocft/db2 @@ -1,164 +1,164 @@ # db2 # -# This test assumes a db2 ESE instance with two partions and a database. +# This test assumes a db2 ESE instance with two partitions and a database. # Default is instance=db2inst1, database=ocft # adapt this in set_testenv below # # Simple steps to generate a test environment (if you don't have one): # # A virtual machine with 1200MB RAM is sufficient # # - download an eval version of DB2 server from IBM # - create an user "db2inst1" in group "db2inst1" # # As root # - install DB2 software in some location # - create instance # cd /instance # ./db2icrt -s ese -u db2inst1 db2inst1 # - adapt profile of db2inst1 as instructed by db2icrt # # As db2inst1 # # allow to run with small memory footprint # db2set DB2_FCM_SETTINGS=FCM_MAXIMIZE_SET_SIZE:FALSE # db2start # db2start dbpartitionnum 1 add dbpartitionnum hostname $(uname -n) port 1 without tablespaces # db2stop # db2start # db2 create database ocft # Done # In order to install a real cluster refer to http://www.linux-ha.org/wiki/db2_(resource_agent) CONFIG Agent db2 AgentRoot /usr/lib/ocf/resource.d/heartbeat HangTimeout 40 SETUP-AGENT # nothing CASE-BLOCK set_testenv Env OCFT_instance=db2inst1 Env OCFT_db=ocft CASE-BLOCK crm_setting Env OCF_RESKEY_instance=$OCFT_instance Env OCF_RESKEY_CRM_meta_timeout=30000 CASE-BLOCK default_status AgentRun stop CASE-BLOCK prepare Include set_testenv Include crm_setting Include default_status CASE "check base env" Include prepare AgentRun start OCF_SUCCESS CASE "check base env: invalid 'OCF_RESKEY_instance'" Include prepare Env 
OCF_RESKEY_instance=no_such AgentRun start OCF_ERR_INSTALLED CASE "invalid instance config" Include prepare Bash eval mv ~$OCFT_instance/sqllib ~$OCFT_instance/sqllib- BashAtExit eval mv ~$OCFT_instance/sqllib- ~$OCFT_instance/sqllib AgentRun start OCF_ERR_INSTALLED CASE "unimplemented command" Include prepare AgentRun no_cmd OCF_ERR_UNIMPLEMENTED CASE "normal start" Include prepare AgentRun start OCF_SUCCESS CASE "normal stop" Include prepare AgentRun start AgentRun stop OCF_SUCCESS CASE "double start" Include prepare AgentRun start AgentRun start OCF_SUCCESS CASE "double stop" Include prepare AgentRun stop OCF_SUCCESS CASE "started: monitor" Include prepare AgentRun start AgentRun monitor OCF_SUCCESS CASE "not started: monitor" Include prepare AgentRun monitor OCF_NOT_RUNNING CASE "killed instance: monitor" Include prepare AgentRun start OCF_SUCCESS AgentRun monitor OCF_SUCCESS BashAtExit rm /tmp/ocft-helper1 Bash echo "su $OCFT_instance -c '. ~$OCFT_instance/sqllib/db2profile; db2nkill 0 >/dev/null 2>&1'" > /tmp/ocft-helper1 Bash sh -x /tmp/ocft-helper1 AgentRun monitor OCF_NOT_RUNNING CASE "overload param instance by admin" Include prepare Env OCF_RESKEY_instance=no_such Env OCF_RESKEY_admin=$OCFT_instance AgentRun start OCF_SUCCESS CASE "check start really activates db" Include prepare AgentRun start OCF_SUCCESS BashAtExit rm /tmp/ocft-helper2 Bash echo "su $OCFT_instance -c '. 
~$OCFT_instance/sqllib/db2profile; db2 get snapshot for database on $OCFT_db>/dev/null'" > /tmp/ocft-helper2 Bash sh -x /tmp/ocft-helper2 CASE "multipartion test" Include prepare AgentRun start OCF_SUCCESS AgentRun monitor OCF_SUCCESS # start does not start partion 1 Env OCF_RESKEY_dbpartitionnum=1 AgentRun monitor OCF_NOT_RUNNING # now start 1 AgentRun start OCF_SUCCESS AgentRun monitor OCF_SUCCESS # now stop 1 AgentRun stop OCF_SUCCESS AgentRun monitor OCF_NOT_RUNNING # does not affect 0 Env OCF_RESKEY_dbpartitionnum=0 AgentRun monitor OCF_SUCCESS # fault injection does not work on the 1.0.4 client due to a hardcoded path CASE "simulate hanging db2stop (not meaningful for 1.0.4 agent)" Include prepare AgentRun start OCF_SUCCESS Bash [ ! -f /usr/local/bin/db2stop ] BashAtExit rm /usr/local/bin/db2stop Bash echo -e "#!/bin/sh\necho fake db2stop\nsleep 10000" > /usr/local/bin/db2stop Bash chmod +x /usr/local/bin/db2stop AgentRun stop OCF_SUCCESS # fault injection does not work on the 1.0.4 client due to a hardcoded path CASE "simulate not stopping db2stop (not meaningful for 1.0.4 agent)" Include prepare AgentRun start OCF_SUCCESS Bash [ ! -f /usr/local/bin/db2stop ] BashAtExit rm /usr/local/bin/db2stop Bash echo -e "#!/bin/sh\necho fake db2stop\nexit 0" > /usr/local/bin/db2stop Bash chmod +x /usr/local/bin/db2stop AgentRun stop OCF_SUCCESS diff --git a/tools/send_arp.libnet.c b/tools/send_arp.libnet.c index 7fdfb06aa..c09de86f0 100644 --- a/tools/send_arp.libnet.c +++ b/tools/send_arp.libnet.c @@ -1,758 +1,758 @@ /* * send_arp * * This program sends out one ARP packet with source/target IP and Ethernet * hardware addresses suuplied by the user. It uses the libnet libary from * Packet Factory (http://www.packetfactory.net/libnet/ ). It has been tested * on Linux, FreeBSD, and on Solaris. * * This inspired by the sample application supplied by Packet Factory. 
* Matt Soffen * Copyright (C) 2001 Matt Soffen * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ /* Needs to be defined before any other includes, otherwise some system * headers do not behave as expected! Major black magic... */ #undef _GNU_SOURCE /* in case it was defined on the command line */ #define _GNU_SOURCE #include #include #define USE_GNU #if defined(ANSI_ONLY) && !defined(inline) # define inline /* nothing */ #endif #include #include #include #include #include #include #ifdef HAVE_LIBNET_1_0_API # define LTYPE struct libnet_link_int static u_char *mk_packet(u_int32_t ip, u_char *device, u_char *macaddr, u_char *broadcast, u_char *netmask, u_short arptype); static int send_arp(struct libnet_link_int *l, u_char *device, u_char *buf); #endif #ifdef HAVE_LIBNET_1_1_API # define LTYPE libnet_t static libnet_t *mk_packet(libnet_t* lntag, u_int32_t ip, u_char *device, u_char macaddr[6], u_char *broadcast, u_char *netmask, u_short arptype); int send_arp(libnet_t* lntag); #endif #define PIDDIR HA_VARRUNDIR "/" PACKAGE #define PIDFILE_BASE PIDDIR "/send_arp-" static char print_usage[]={ "send_arp: sends out custom ARP packet.\n" " usage: send_arp [-i repeatinterval-ms] [-r repeatcount] [-p pidfile] \\\n" " device src_ip_addr src_hw_addr broadcast_ip_addr netmask\n" "\n" " where:\n" " repeatinterval-ms: timing, in 
milliseconds of sending arp packets\n" " For each ARP announcement requested, a pair of ARP packets is sent,\n" " an ARP request, and an ARP reply. This is because some systems\n" " ignore one or the other, and this combination gives the greatest\n" " chance of success.\n" "\n" " Each time an ARP is sent, if another ARP will be sent then\n" " the code sleeps for half of repeatinterval-ms.\n" "\n" " repeatcount: how many pairs of ARP packets to send.\n" " See above for why pairs are sent\n" "\n" " pidfile: pid file to use\n" "\n" " device: network interface to use\n" "\n" " src_ip_addr: source ip address\n" "\n" " src_hw_addr: source hardware address.\n" " If \"auto\" then the address of device\n" "\n" " broadcast_ip_addr: ignored\n" "\n" " netmask: ignored\n" }; static const char * SENDARPNAME = "send_arp"; static void convert_macaddr (u_char *macaddr, u_char enet_src[6]); static int get_hw_addr(char *device, u_char mac[6]); int write_pid_file(const char *pidfilename); int create_pid_directory(const char *piddirectory); #define AUTO_MAC_ADDR "auto" #ifndef LIBNET_ERRBUF_SIZE # define LIBNET_ERRBUF_SIZE 256 #endif /* * For use logd, should keep identical with the same const variables defined * in heartbeat.h. 
*/ #define ENV_PREFIX "HA_" #define KEY_LOGDAEMON "use_logd" static void byebye(int nsig) { (void)nsig; /* Avoid an "error exit" log message if we're killed */ exit(0); } int main(int argc, char *argv[]) { int c = -1; char errbuf[LIBNET_ERRBUF_SIZE]; char* device; char* ipaddr; char* macaddr; char* broadcast; char* netmask; u_int32_t ip; u_char src_mac[6]; int repeatcount = 1; int j; long msinterval = 1000; int flag; char pidfilenamebuf[64]; char *pidfilename = NULL; #ifdef HAVE_LIBNET_1_0_API LTYPE* l; u_char *request, *reply; #elif defined(HAVE_LIBNET_1_1_API) LTYPE *request, *reply; #endif CL_SIGNAL(SIGTERM, byebye); CL_SIGINTERRUPT(SIGTERM, 1); cl_log_set_entity(SENDARPNAME); cl_log_enable_stderr(TRUE); cl_log_set_facility(LOG_USER); cl_inherit_logging_environment(0); while ((flag = getopt(argc, argv, "i:r:p:")) != EOF) { switch(flag) { case 'i': msinterval= atol(optarg); break; case 'r': repeatcount= atoi(optarg); break; case 'p': pidfilename= optarg; break; default: fprintf(stderr, "%s\n\n", print_usage); return 1; break; } } if (argc-optind != 5) { fprintf(stderr, "%s\n\n", print_usage); return 1; } /* * argv[optind+1] DEVICE dc0,eth0:0,hme0:0, * argv[optind+2] IP 192.168.195.186 * argv[optind+3] MAC ADDR 00a0cc34a878 * argv[optind+4] BROADCAST 192.168.195.186 * argv[optind+5] NETMASK ffffffffffff */ device = argv[optind]; ipaddr = argv[optind+1]; macaddr = argv[optind+2]; broadcast = argv[optind+3]; netmask = argv[optind+4]; if (!pidfilename) { if (snprintf(pidfilenamebuf, sizeof(pidfilenamebuf), "%s%s", PIDFILE_BASE, ipaddr) >= (int)sizeof(pidfilenamebuf)) { cl_log(LOG_INFO, "Pid file truncated"); return EXIT_FAILURE; } pidfilename = pidfilenamebuf; } if(write_pid_file(pidfilename) < 0) { return EXIT_FAILURE; } if (!strcasecmp(macaddr, AUTO_MAC_ADDR)) { if (get_hw_addr(device, src_mac) < 0) { cl_log(LOG_ERR, "Cannot find mac address for %s", device); unlink(pidfilename); return EXIT_FAILURE; } } else { convert_macaddr((unsigned char *)macaddr, src_mac); } 
/* * We need to send both a broadcast ARP request as well as the ARP response we * were already sending. All the interesting research work for this fix was * done by Masaki Hasegawa and his colleagues. */ #if defined(HAVE_LIBNET_1_0_API) #ifdef ON_DARWIN if ((ip = libnet_name_resolve((unsigned char*)ipaddr, 1)) == -1UL) { #else if ((ip = libnet_name_resolve(ipaddr, 1)) == -1UL) { #endif cl_log(LOG_ERR, "Cannot resolve IP address [%s]", ipaddr); unlink(pidfilename); return EXIT_FAILURE; } l = libnet_open_link_interface(device, errbuf); if (!l) { cl_log(LOG_ERR, "libnet_open_link_interface on %s: %s" , device, errbuf); unlink(pidfilename); return EXIT_FAILURE; } request = mk_packet(ip, (unsigned char*)device, src_mac , (unsigned char*)broadcast, (unsigned char*)netmask , ARPOP_REQUEST); reply = mk_packet(ip, (unsigned char*)device, src_mac , (unsigned char *)broadcast , (unsigned char *)netmask, ARPOP_REPLY); if (!request || !reply) { cl_log(LOG_ERR, "could not create packets"); unlink(pidfilename); return EXIT_FAILURE; } for (j=0; j < repeatcount; ++j) { c = send_arp(l, (unsigned char*)device, request); if (c < 0) { break; } mssleep(msinterval / 2); c = send_arp(l, (unsigned char*)device, reply); if (c < 0) { break; } if (j != repeatcount-1) { mssleep(msinterval / 2); } } #elif defined(HAVE_LIBNET_1_1_API) if ((request=libnet_init(LIBNET_LINK, device, errbuf)) == NULL) { cl_log(LOG_ERR, "libnet_init failure on %s: %s", device, errbuf); unlink(pidfilename); return EXIT_FAILURE; } if ((reply=libnet_init(LIBNET_LINK, device, errbuf)) == NULL) { cl_log(LOG_ERR, "libnet_init failure on %s: %s", device, errbuf); unlink(pidfilename); return EXIT_FAILURE; } if ((signed)(ip = libnet_name2addr4(request, ipaddr, 1)) == -1) { cl_log(LOG_ERR, "Cannot resolve IP address [%s]", ipaddr); unlink(pidfilename); return EXIT_FAILURE; } request = mk_packet(request, ip, (unsigned char*)device, src_mac , (unsigned char*)broadcast, (unsigned char*)netmask , ARPOP_REQUEST); reply = 
/*
 * Convert a textual MAC address into its six raw octets.
 *
 * macaddr:  NUL-terminated string of hex digits, two per octet, either
 *           packed ("00a0cc34a878") or with a ':' between octets
 *           ("00:a0:cc:34:a8:78").
 * enet_src: receives the six octets.
 *
 * Octets for which the input has no digits left are set to 0, so a short
 * string never causes a read past its terminating NUL.  Characters that
 * are not hex digits convert to 0, as strtol() stops at them.
 */
void
convert_macaddr (u_char *macaddr, u_char enet_src[6])
{
	size_t pos = 0;
	int i;

	for (i = 0; i < 6; i++) {
		char bits[3] = { '\0', '\0', '\0' };

		/*
		 * Skip the separator in front of the current octet.  The
		 * character at the current position has to be examined, not
		 * the first character of the string.
		 */
		if (macaddr[pos] == ':') {
			pos++;
		}

		/* Take at most two digits and never step over the NUL. */
		if (macaddr[pos] != '\0') {
			bits[0] = (char)macaddr[pos++];
			if (macaddr[pos] != '\0') {
				bits[1] = (char)macaddr[pos++];
			}
		}

		enet_src[i] = (u_char)strtol(bits, (char **)NULL, 16);
	}
}
#ifdef HAVE_LIBNET_1_0_API
/*
 * Build one Ethernet + ARP frame with the libnet 1.0 API.
 *
 * ip:        address being announced; used as both the sender and the
 *            target protocol address of the ARP payload.
 * device:    interface name; its hardware address becomes the Ethernet
 *            source address.
 * macaddr:   hardware address to associate with ip (ARP sender hardware
 *            address, and target hardware address of a reply).
 * broadcast: unused.
 * netmask:   unused.
 * arptype:   ARPOP_REQUEST or ARPOP_REPLY.
 *
 * Returns the packet buffer allocated by libnet_init_packet(), or NULL
 * on failure.  On every failure path the buffer is released again.
 */
u_char *
mk_packet(u_int32_t ip, u_char *device, u_char *macaddr, u_char *broadcast, u_char *netmask, u_short arptype)
{
	u_char *buf = NULL;
	u_char *target_mac;
	u_char device_mac[6];
	u_char bcast_mac[6] = {0xff, 0xff, 0xff, 0xff, 0xff, 0xff};
	u_char zero_mac[6] = {0x00, 0x00, 0x00, 0x00, 0x00, 0x00};

	(void)broadcast;
	(void)netmask;

	/* Reject an unsupported operation before anything is allocated. */
	if (arptype == ARPOP_REQUEST) {
		target_mac = zero_mac;
	} else if (arptype == ARPOP_REPLY) {
		target_mac = macaddr;
	} else {
		cl_log(LOG_ERR, "unknown arptype");
		return NULL;
	}

	if (libnet_init_packet(LIBNET_ARP_H + LIBNET_ETH_H, &buf) == -1) {
		cl_log(LOG_ERR, "libnet_init_packet memory:");
		return NULL;
	}

	/* Ethernet header: broadcast destination, interface MAC as source */
	if (get_hw_addr((char*)device, device_mac) < 0) {
		cl_log(LOG_ERR, "Cannot find mac address for %s", device);
		goto fail;
	}

	if (libnet_build_ethernet(bcast_mac, device_mac, ETHERTYPE_ARP, NULL, 0
	,	buf) == -1) {
		cl_log(LOG_ERR, "libnet_build_ethernet failed:");
		goto fail;
	}

	/*
	 *  ARP header
	 */
	if (libnet_build_arp(ARPHRD_ETHER,	/* Hardware address type */
		ETHERTYPE_IP,			/* Protocol address type */
		6,				/* Hardware address length */
		4,				/* Protocol address length */
		arptype,			/* ARP operation */
		macaddr,			/* Sender hardware address */
		(u_char *)&ip,			/* Sender protocol address */
		target_mac,			/* Target hardware address */
		(u_char *)&ip,			/* Target protocol address */
		NULL,				/* Payload */
		0,				/* Payload length */
		buf + LIBNET_ETH_H) == -1) {
		cl_log(LOG_ERR, "libnet_build_arp failed:");
		goto fail;
	}

	return buf;

fail:
	libnet_destroy_packet(&buf);
	return NULL;
}
#endif /* HAVE_LIBNET_1_0_API */
/*
 * Make sure the directory that is to contain pidfilename exists,
 * creating it (mode 0750) when it is missing.
 *
 * Returns 0 if the directory exists on return, -1 otherwise (the reason
 * is logged).
 */
int
create_pid_directory(const char *pidfilename)
{
	struct stat stat_buf;
	char *pidfilename_cpy;
	char *dir;
	int rc = -1;

	/* dirname() may modify its argument, so work on a private copy. */
	pidfilename_cpy = strdup(pidfilename);
	if (!pidfilename_cpy) {
		cl_log(LOG_INFO, "Memory allocation failure: %s\n",
				strerror(errno));
		return -1;
	}

	dir = dirname(pidfilename_cpy);

	if (stat(dir, &stat_buf) >= 0) {
		if (S_ISDIR(stat_buf.st_mode)) {
			rc = 0;
		} else {
			cl_log(LOG_INFO, "Pid-File directory exists but is "
					"not a directory [%s]", dir);
		}
		goto out;
	}

	if (errno != ENOENT && errno != ENOTDIR) {
		cl_log(LOG_INFO, "Could not stat pid-file directory "
				"[%s]: %s", dir, strerror(errno));
		goto out;
	}

	if (mkdir(dir, S_IRUSR|S_IWUSR|S_IXUSR | S_IRGRP|S_IXGRP) < 0) {
		/* Keep mkdir's errno: the stat() below may overwrite it. */
		int mkdir_errno = errno;

		/* Did someone else make it while we were trying ? */
		if (mkdir_errno == EEXIST && stat(dir, &stat_buf) >= 0
				&& S_ISDIR(stat_buf.st_mode)) {
			rc = 0;
		} else {
			cl_log(LOG_INFO, "Could not create pid-file directory "
					"[%s]: %s", dir, strerror(mkdir_errno));
		}
		goto out;
	}

	rc = 0;

out:
	/* Single exit so the copy is released on every path. */
	free(pidfilename_cpy);
	return rc;
}

/*
 * Parse the content of an old pid file.
 *
 * Accepts a decimal number, optionally followed by white space, that is
 * strictly positive and representable as pid_t.  Everything else is
 * rejected so that the value can never reach kill() as 0 or as a
 * negative pid, both of which address whole groups of processes.
 *
 * Returns 0 and stores the pid in *pid_out, or -1 if text is not valid.
 */
static int
parse_old_pid(const char *text, pid_t *pid_out)
{
	char *end = NULL;
	unsigned long value;

	/* strtoul() would accept leading white space and a sign. */
	if (text[0] < '0' || text[0] > '9') {
		return -1;
	}

	errno = 0;
	value = strtoul(text, &end, 10);
	if (errno == ERANGE) {
		return -1;
	}

	while (*end == ' ' || *end == '\t' || *end == '\r' || *end == '\n') {
		end++;
	}
	if (*end != '\0') {
		return -1;
	}

	if (value == 0 || (pid_t)value <= 0
			|| (unsigned long)(pid_t)value != value) {
		return -1;
	}

	*pid_out = (pid_t)value;
	return 0;
}

/*
 * Create pidfilename exclusively and write our own pid into it.
 *
 * pidfilename must be an absolute path.  If the file already exists, the
 * pid stored in it is read, the file is removed, the process with that
 * pid is sent SIGKILL and the exclusive create is retried.
 *
 * Returns 0 on success, -1 on failure (the reason is logged).  No file
 * descriptor stays open on return, and a pid file that was created but
 * could not be filled is removed again.
 */
int
write_pid_file(const char *pidfilename)
{
	int pidfilefd;
	char pidbuf[11];
	pid_t old_pid;
	ssize_t bytes;
	size_t len;
	int n;

	if (*pidfilename != '/') {
		cl_log(LOG_INFO, "Invalid pid-file name, must begin with a "
				"'/' [%s]\n", pidfilename);
		return -1;
	}

	if (create_pid_directory(pidfilename) < 0) {
		return -1;
	}

	for (;;) {
		int read_errno;

		pidfilefd = open(pidfilename, O_CREAT|O_EXCL|O_RDWR,
				S_IRUSR|S_IWUSR);
		if (pidfilefd >= 0) {
			break;
		}
		if (errno != EEXIST) {
			cl_log(LOG_INFO, "Could not open pid-file "
					"[%s]: %s", pidfilename,
					strerror(errno));
			return -1;
		}

		/* Old PID file: find out who left it behind. */
		pidfilefd = open(pidfilename, O_RDONLY);
		if (pidfilefd < 0) {
			cl_log(LOG_INFO, "Could not open pid-file "
					"[%s]: %s", pidfilename,
					strerror(errno));
			return -1;
		}

		do {
			bytes = read(pidfilefd, pidbuf, sizeof(pidbuf)-1);
		} while (bytes < 0 && errno == EINTR);
		read_errno = errno;

		/* The descriptor is not needed past this point. */
		close(pidfilefd);

		if (bytes < 0) {
			cl_log(LOG_INFO, "Could not read pid-file "
					"[%s]: %s", pidfilename,
					strerror(read_errno));
			return -1;
		}
		pidbuf[bytes] = '\0';

		if (unlink(pidfilename) < 0) {
			cl_log(LOG_INFO, "Could not delete pid-file "
					"[%s]: %s", pidfilename,
					strerror(errno));
			return -1;
		}

		if (parse_old_pid(pidbuf, &old_pid) < 0) {
			cl_log(LOG_INFO, "Invalid pid in pid-file "
					"[%s]: %s", pidfilename, pidbuf);
			return -1;
		}

		if (kill(old_pid, SIGKILL) < 0 && errno != ESRCH) {
			cl_log(LOG_INFO, "Error killing old process [%lu] "
					"from pid-file [%s]: %s",
					(unsigned long)old_pid,
					pidfilename, strerror(errno));
			return -1;
		}

		cl_log(LOG_INFO, "Killed old send_arp process [%lu]\n",
				(unsigned long)old_pid);
	}

	n = snprintf(pidbuf, sizeof(pidbuf), "%ld", (long)getpid());
	if (n < 0 || n >= (int)sizeof(pidbuf)) {
		cl_log(LOG_INFO, "Pid too long for buffer [%ld]",
				(long)getpid());
		goto fail;
	}
	len = (size_t)n;

	do {
		bytes = write(pidfilefd, pidbuf, len);
	} while (bytes < 0 && errno == EINTR);

	if (bytes != (ssize_t)len) {
		cl_log(LOG_INFO, "Could not write pid-file "
				"[%s]: %s", pidfilename,
				strerror(errno));
		goto fail;
	}

	close(pidfilefd);
	return 0;

fail:
	close(pidfilefd);
	/* Do not leave an empty pid file behind for the next invocation. */
	unlink(pidfilename);
	return -1;
}