diff --git a/doc/Clusters_from_Scratch/en-US/Ch-Stonith.xml b/doc/Clusters_from_Scratch/en-US/Ch-Stonith.xml
index 3fa6657afb..698bade5bb 100644
--- a/doc/Clusters_from_Scratch/en-US/Ch-Stonith.xml
+++ b/doc/Clusters_from_Scratch/en-US/Ch-Stonith.xml
@@ -1,140 +1,142 @@
Configure STONITH
Why You Need STONITH

STONITH is an acronym for Shoot-The-Other-Node-In-The-Head. It protects your data from corruption by rogue nodes making concurrent or uncontrolled accesses to it. An unresponsive node may still be accessing your data, or may resume doing so without warning; Pacemaker uses STONITH to ensure the safety of your data by making sure the node is truly offline before allowing the data to be accessed from another node.

Pacemaker will also resort to STONITH in the event that a clustered service cannot be stopped. In that case, the whole node must be forced offline to make it safe to start the service elsewhere.
Selecting a STONITH Device

It is crucial that the STONITH device allows the cluster to differentiate between a node failure and a network failure.
- A common mistake people make in choosing a STONITH device is to use an onboard power controller that shares power and/or network with the node it controls. In such cases, the cluster cannot be sure if the node is really offline, or active and suffering from a network fault.
+ A common mistake people make in choosing a STONITH device is to use an onboard power controller that shares power and/or network with the node it controls.
+ In such cases, the cluster cannot be sure if the node is really offline, or active and suffering from a network fault.
+ Unfortunately, many IPMI and iLO boards fall into this category.
Likewise, any device that relies on the machine being active (such as an SSH-based “device” used during testing) is inappropriate.
Configuring STONITH

Configuration consists of identifying the proper driver for your desired device and supplying its required parameters. The installed cluster software can provide this information. The crm tool is used to describe this configuration; it then creates and uploads the required XML to the cluster. If your STONITH device can control multiple nodes and supports multiple simultaneous connections, a clone can be created to potentially speed up recovery.

Use the command stonith -L to produce a list of the drivers available with your software. Once you have identified the driver you're interested in, use stonith -t {type} -n to produce a list of the required parameters for that driver.

[beekhof@pcmk-1 ~]$ stonith -L
 apcmaster
 apcmastersmart
 ...
 external/ibmrsa
 ...
[beekhof@pcmk-1 ~]$ stonith -t external/ibmrsa -n
hostname  ipaddr  userid  passwd  type

Hopefully the developers chose names that make sense; if not, you can query for some additional information by finding an active cluster node and running:

lrmadmin -M stonith {type} pacemaker

The output should be XML-formatted text containing additional parameter descriptions.
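The parameter names reported by stonith -t {type} -n map directly onto the params section of a STONITH primitive. As a minimal sketch of the general pattern — the resource name my-fencing and the parameter names and values below are placeholders, not output from a real device; a complete worked example for the IBM RSA driver follows in the next section:

[root@pcmk-1 ~]# crm configure primitive my-fencing stonith::{type} \
        params param1="value1" param2="value2" \
        op monitor interval="60s"

The monitor operation is worth including so the cluster can periodically verify that the fencing device itself is still reachable.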
Example

Assuming we have an IBM BladeCenter containing our two nodes, with its management interface active on 192.168.122.31, we would start crm and issue the following commands to create a STONITH resource.

[root@pcmk-1 ~]# crm
crm(live)# cib new stonith
INFO: stonith shadow CIB created
crm(stonith)# configure primitive rsa-fencing stonith::external/ibmrsa \
        params hostname="pcmk-1 pcmk-2" ipaddr=192.168.122.31 userid=mgmt passwd=abc123 \
        type=ibm op monitor interval="60s"
crm(stonith)# configure clone Fencing rsa-fencing

And finally, since we disabled it earlier, we need to re-enable STONITH. At this point we should have the following configuration:

crm(stonith)# configure property stonith-enabled="true"
crm(stonith)# configure show
node pcmk-1
node pcmk-2
primitive WebData ocf:linbit:drbd \
        params drbd_resource="wwwdata" \
        op monitor interval="60s"
primitive WebFS ocf:heartbeat:Filesystem \
        params device="/dev/drbd/by-res/wwwdata" directory="/var/www/html" fstype="gfs2"
primitive WebSite ocf:heartbeat:apache \
        params configfile="/etc/httpd/conf/httpd.conf" \
        op monitor interval="1min"
primitive ClusterIP ocf:heartbeat:IPaddr2 \
        params ip="192.168.122.101" cidr_netmask="32" clusterip_hash="sourceip" \
        op monitor interval="30s"
primitive dlm ocf:pacemaker:controld \
        op monitor interval="120s"
primitive gfs-control ocf:pacemaker:controld \
        params daemon="gfs_controld.pcmk" args="-g 0" \
        op monitor interval="120s"
primitive rsa-fencing stonith::external/ibmrsa \
        params hostname="pcmk-1 pcmk-2" ipaddr=192.168.122.31 userid=mgmt passwd=abc123 type=ibm \
        op monitor interval="60s"
ms WebDataClone WebData \
        meta master-max="2" master-node-max="1" clone-max="2" clone-node-max="1" notify="true"
clone Fencing rsa-fencing
clone WebFSClone WebFS
clone WebIP ClusterIP \
        meta globally-unique="true" clone-max="2" clone-node-max="2"
clone WebSiteClone WebSite
clone dlm-clone dlm \
        meta interleave="true"
clone gfs-clone gfs-control \
        meta interleave="true"
colocation WebFS-with-gfs-control inf: WebFSClone gfs-clone
colocation WebSite-with-WebFS inf: WebSiteClone WebFSClone
colocation fs_on_drbd inf: WebFSClone WebDataClone:Master
colocation gfs-with-dlm inf: gfs-clone dlm-clone
colocation website-with-ip inf: WebSiteClone WebIP
order WebFS-after-WebData inf: WebDataClone:promote WebFSClone:start
order WebSite-after-WebFS inf: WebFSClone WebSiteClone
order apache-after-ip inf: WebIP WebSiteClone
order start-WebFS-after-gfs-control inf: gfs-clone WebFSClone
order start-gfs-after-dlm inf: dlm-clone gfs-clone
property $id="cib-bootstrap-options" \
        dc-version="1.1.5-bdd89e69ba545404d02445be1f3d72e6a203ba2f" \
        cluster-infrastructure="openais" \
        expected-quorum-votes="2" \
        stonith-enabled="true" \
        no-quorum-policy="ignore"
rsc_defaults $id="rsc-options" \
        resource-stickiness="100"
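Note that everything above was entered into a shadow CIB named stonith, so none of it affects the running cluster until the shadow copy is committed. A minimal sketch of pushing it live, assuming the crm shell's cib commit subcommand, which uploads the shadow copy as the live configuration:

crm(stonith)# cib commit stonith
crm(stonith)# quit

Until the commit, the live cluster continues to run with the old configuration, which makes the shadow CIB a safe place to stage a change of this size.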