diff --git a/doc/Pacemaker_Explained/en-US/Ch-Stonith.txt b/doc/Pacemaker_Explained/en-US/Ch-Stonith.txt index 133077a74b..75a0332530 100644 --- a/doc/Pacemaker_Explained/en-US/Ch-Stonith.txt +++ b/doc/Pacemaker_Explained/en-US/Ch-Stonith.txt @@ -1,909 +1,924 @@ = STONITH = //// We prefer [[ch-stonith]], but older versions of asciidoc don't deal well with that construct for chapter headings //// anchor:ch-stonith[Chapter 13, STONITH] indexterm:[STONITH, Configuration] == What Is STONITH? == STONITH (an acronym for "Shoot The Other Node In The Head"), also called 'fencing', protects your data from being corrupted by rogue nodes or concurrent access. Just because a node is unresponsive doesn't mean it isn't accessing your data. The only way to be 100% sure that your data is safe is to use STONITH, so we can be certain that the node is truly offline before allowing the data to be accessed from another node. STONITH also has a role to play in the event that a clustered service cannot be stopped. In this case, the cluster uses STONITH to force the whole node offline, thereby making it safe to start the service elsewhere. == What STONITH Device Should You Use? == It is crucial that the STONITH device allows the cluster to differentiate between a node failure and a network one. The biggest mistake people make in choosing a STONITH device is to use a remote power switch (such as many on-board IPMI controllers) that shares power with the node it controls. In such cases, the cluster cannot be sure if the node is really offline, or active and suffering from a network fault. Likewise, any device that relies on the machine being active (such as SSH-based "devices" used during testing) is inappropriate. == Special Treatment of STONITH Resources == STONITH resources are somewhat special in Pacemaker. STONITH may be initiated by pacemaker or by other parts of the cluster (such as resources like DRBD or DLM). To accommodate this, pacemaker does not require the STONITH resource to be in the 'started' state in order to be used, thus allowing reliable use of STONITH devices in such a case. [NOTE] ==== In pacemaker versions 1.1.9 and earlier, this feature either did not exist or did not work well. Only "running" STONITH resources could be used by Pacemaker for fencing, and if another component tried to fence a node while Pacemaker was moving STONITH resources, the fencing could fail. ==== All nodes have access to STONITH devices' definitions and instantiate them on-the-fly when needed, but preference is given to 'verified' instances, which are the ones that are 'started' according to the cluster's knowledge. In the case of a cluster split, the partition with a verified instance will have a slight advantage, because the STONITH daemon in the other partition will have to hear from all its current peers before choosing a node to perform the fencing. Fencing resources do work the same as regular resources in some respects: * +target-role+ can be used to enable or disable the resource * Location constraints can be used to prevent a specific node from using the resource [IMPORTANT] =========== Currently there is a limitation that fencing resources may only have one set of meta-attributes and one set of instance attributes. This can be revisited if it becomes a significant limitation for people. =========== See the table below or run `man stonithd` to see special instance attributes that may be set for any fencing resource, regardless of fence agent.
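For example, in a two-node cluster, the delay attributes described in the table that follows can be combined so that the nodes are unlikely to fence each other simultaneously. The fragment below is a minimal sketch only: the agent, node names, address, credentials, and delay values are illustrative, not a recommendation.

.A STONITH Resource Combining Static and Random Fencing Delays
====
[source,XML]
----
<primitive id="Fencing" class="stonith" type="fence_ipmilan">
  <instance_attributes id="Fencing-params">
    <nvpair id="Fencing-ipaddr" name="ipaddr" value="192.0.2.1"/>
    <nvpair id="Fencing-login" name="login" value="testuser"/>
    <nvpair id="Fencing-passwd" name="passwd" value="abc123"/>
    <nvpair id="Fencing-host-list" name="pcmk_host_list" value="pcmk-1 pcmk-2"/>
    <!-- Illustrative delays: wait a static 5s plus a random delay,
         with the overall delay kept below 30s -->
    <nvpair id="Fencing-delay-base" name="pcmk_delay_base" value="5s"/>
    <nvpair id="Fencing-delay-max" name="pcmk_delay_max" value="30s"/>
  </instance_attributes>
</primitive>
----
====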
.Properties of Fencing Resources [width="95%",cols="5m,2,3,10 ---- ==== Based on that, we would create a STONITH resource fragment that might look like this: .An IPMI-based STONITH Resource ==== [source,XML] ----
<primitive id="Fencing" class="stonith" type="fence_ipmilan" >
  <instance_attributes id="Fencing-params" >
    <nvpair id="Fencing-passwd" name="passwd" value="testuser" />
    <nvpair id="Fencing-login" name="login" value="abc123" />
    <nvpair id="Fencing-ipaddr" name="ipaddr" value="192.0.2.1" />
    <nvpair id="Fencing-pcmk_host_list" name="pcmk_host_list" value="pcmk-1 pcmk-2" />
  </instance_attributes>
  <operations >
    <op id="Fencing-monitor-10m" interval="10m" name="monitor" timeout="300s" />
  </operations>
</primitive>
---- ==== Finally, we need to enable STONITH: ---- # crm_attribute -t crm_config -n stonith-enabled -v true ---- == Advanced STONITH Configurations == Some people consider that having one fencing device is a single point of failure footnote:[Not true, since a node or resource must fail before fencing even has a chance to]; others prefer removing the node from the storage and network instead of turning it off. Whatever the reason, Pacemaker supports fencing nodes with multiple devices through a feature called 'fencing topologies'. Simply create the individual devices as you normally would, then define one or more +fencing-level+ entries in the +fencing-topology+ section of the configuration. * Each fencing level is attempted in order of ascending +index+. Allowed values are 1 through 9. * If a device fails, processing terminates for the current level. No further devices in that level are exercised, and the next level is attempted instead. * If the operation succeeds for all the listed devices in a level, the level is deemed to have passed. * The operation is finished when a level has passed (success), or all levels have been attempted (failed). * If the operation failed, the next step is determined by the Policy Engine and/or `crmd`. Some possible uses of topologies include: * Try poison-pill and fail back to power * Try disk and network, and fall back to power if either fails * Initiate a kdump and then poweroff the node .Properties of Fencing Levels [width="95%",cols="1m,3<",options="header",align="center"] |========================================================= |Field |Description |id |A unique name for the level indexterm:[id,fencing-level] indexterm:[Fencing,fencing-level,id] |target |The name of a single node to which this level applies indexterm:[target,fencing-level] indexterm:[Fencing,fencing-level,target] |target-pattern |A regular expression matching the names of nodes to which this level applies '(since 1.1.14)' indexterm:[target-pattern,fencing-level] indexterm:[Fencing,fencing-level,target-pattern] |target-attribute |The name of a node attribute that is set (to +target-value+) for nodes to which this level applies '(since 1.1.14)' indexterm:[target-attribute,fencing-level] indexterm:[Fencing,fencing-level,target-attribute] |target-value |The node attribute value (of +target-attribute+) that is set for nodes to which this level applies '(since 1.1.14)' indexterm:[target-value,fencing-level] indexterm:[Fencing,fencing-level,target-value] |index |The order in which to attempt the levels. Levels are attempted in ascending order 'until one succeeds'. Valid values are 1 through 9. indexterm:[index,fencing-level] indexterm:[Fencing,fencing-level,index] |devices |A comma-separated list of devices that must all be tried for this level indexterm:[devices,fencing-level] indexterm:[Fencing,fencing-level,devices] |========================================================= .Fencing topology with different devices for different nodes ==== [source,XML] ----
<cib crm_feature_set="3.0.6" validate-with="pacemaker-1.2" admin_epoch="1" epoch="0" num_updates="0">
  <configuration>
    ...
    <fencing-topology>
      <!-- For pcmk-1, try poison-pill and fail back to power -->
      <fencing-level id="f-p1.1" target="pcmk-1" index="1" devices="poison-pill"/>
      <fencing-level id="f-p1.2" target="pcmk-1" index="2" devices="power"/>

      <!-- For pcmk-2, try disk and network, and fail back to power -->
      <fencing-level id="f-p2.1" target="pcmk-2" index="1" devices="disk,network"/>
      <fencing-level id="f-p2.2" target="pcmk-2" index="2" devices="power"/>
    </fencing-topology>
    ...
  </configuration>
  <status/>
</cib>
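<!-- An alternative sketch, not part of the original example: since 1.1.14,
     a single regex-based level using target-pattern could replace the
     per-node power levels above, e.g.
     <fencing-level id="f-all.1" target-pattern="pcmk-.*" index="2" devices="power"/>
-->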
---- ==== === Example Dual-Layer, Dual-Device Fencing Topologies === The following example illustrates an advanced use of +fencing-topology+ in a cluster with the following properties: * 3 nodes (2 active prod-mysql nodes, 1 prod-mysql-rep1 in standby for quorum purposes) * the active nodes have an IPMI-controlled power board reached at 192.0.2.1 and 192.0.2.2 * the active nodes also have two independent PSUs (Power Supply Units) connected to two independent PDUs (Power Distribution Units) reached at 198.51.100.1 (port 10 and port 11) and 203.0.113.1 (port 10 and port 11) * the first fencing method uses the `fence_ipmi` agent * the second fencing method uses the `fence_apc_snmp` agent targeting 2 fencing devices (one per PSU, either port 10 or 11) * fencing is only implemented for the active nodes and has location constraints * fencing topology is set to try IPMI fencing first, then fall back to "sure-kill" dual PDU fencing In a normal failure scenario, STONITH will first select +fence_ipmi+ to try to kill the faulty node. Using a fencing topology, if that first method fails, STONITH will then move on to selecting +fence_apc_snmp+ twice: * once for the first PDU * again for the second PDU The fence action is considered successful only if both PDUs report the required status. If any of them fails, STONITH loops back to the first fencing method, +fence_ipmi+, and so on until the node is fenced or the fencing action is cancelled. .First fencing method: single IPMI device Each cluster node has its own dedicated IPMI channel that can be called for fencing using the following primitives: [source,XML] ---- ---- .Second fencing method: dual PDU devices Each cluster node also has two distinct power channels controlled by two distinct PDUs. That means a total of 4 fencing devices configured as follows: - Node 1, PDU 1, PSU 1 @ port 10 - Node 1, PDU 2, PSU 2 @ port 10 - Node 2, PDU 1, PSU 1 @ port 11 - Node 2, PDU 2, PSU 2 @ port 11 The matching fencing agents are configured as follows: [source,XML] ---- ---- .Location Constraints To prevent STONITH from trying to run a fencing agent on the same node it is supposed to fence, constraints are placed on all the fencing primitives: [source,XML] ----
<constraints>
  <rsc_location id="l_fence_prod-mysql1_ipmi" node="prod-mysql1" rsc="fence_prod-mysql1_ipmi" score="-INFINITY"/>
  <rsc_location id="l_fence_prod-mysql2_ipmi" node="prod-mysql2" rsc="fence_prod-mysql2_ipmi" score="-INFINITY"/>
  <rsc_location id="l_fence_prod-mysql1_apc2" node="prod-mysql1" rsc="fence_prod-mysql1_apc2" score="-INFINITY"/>
  <rsc_location id="l_fence_prod-mysql1_apc1" node="prod-mysql1" rsc="fence_prod-mysql1_apc1" score="-INFINITY"/>
  <rsc_location id="l_fence_prod-mysql2_apc1" node="prod-mysql2" rsc="fence_prod-mysql2_apc1" score="-INFINITY"/>
  <rsc_location id="l_fence_prod-mysql2_apc2" node="prod-mysql2" rsc="fence_prod-mysql2_apc2" score="-INFINITY"/>
</constraints>
---- .Fencing topology Now that all the fencing resources are defined, it's time to create the right topology. We want to first fence using IPMI and if that does not work, fence both PDUs to effectively and surely kill the node. [source,XML] ----
<fencing-topology>
  <fencing-level devices="fence_prod-mysql1_ipmi" id="fencing-2" index="1" target="prod-mysql1"/>
  <fencing-level devices="fence_prod-mysql1_apc1,fence_prod-mysql1_apc2" id="fencing-3" index="2" target="prod-mysql1"/>
  <fencing-level devices="fence_prod-mysql2_ipmi" id="fencing-0" index="1" target="prod-mysql2"/>
  <fencing-level devices="fence_prod-mysql2_apc1,fence_prod-mysql2_apc2" id="fencing-1" index="2" target="prod-mysql2"/>
</fencing-topology>
---- Please note, in +fencing-topology+, the lowest +index+ value determines the priority of the first fencing method. .Final configuration Put together, the configuration looks like this: [source,XML] ---- ... ... ---- == Remapping Reboots == When the cluster needs to reboot a node, whether because +stonith-action+ is +reboot+ or because a reboot was manually requested (such as by `stonith_admin --reboot`), it will remap that to other commands in two cases: . If the chosen fencing device does not support the +reboot+ command, the cluster will ask it to perform +off+ instead. . If a fencing topology level with multiple devices must be executed, the cluster will ask all the devices to perform +off+, then ask the devices to perform +on+. To understand the second case, consider the example of a node with redundant power supplies connected to intelligent power switches. Rebooting one switch and then the other would have no effect on the node. Turning both switches off, and then on, actually reboots the node.
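For example, the second-level entries in the dual-PDU topology above each list both PDU devices for a node; those are exactly the levels to which this off/on remapping applies (the element below is repeated from the earlier example):

[source,XML]
----
<!-- Both PDUs are asked to perform "off", then both are asked to perform
     "on", so the node cannot stay powered through one switch while the
     other is power-cycled -->
<fencing-level devices="fence_prod-mysql1_apc1,fence_prod-mysql1_apc2"
               id="fencing-3" index="2" target="prod-mysql1"/>
----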
In such a case, the fencing operation will be treated as successful as long as the +off+ commands succeed, because then it is safe for the cluster to recover any resources that were on the node. Timeouts and errors in the +on+ phase will be logged but ignored. When a reboot operation is remapped, any action-specific timeout for the remapped action will be used (for example, +pcmk_off_timeout+ will be used when executing the +off+ command, not +pcmk_reboot_timeout+). [NOTE] ==== In Pacemaker versions 1.1.13 and earlier, reboots will not be remapped in the second case. To achieve the same effect, separate fencing devices for off and on actions must be configured. ==== diff --git a/doc/Pacemaker_Explained/pot/Ch-Stonith.pot b/doc/Pacemaker_Explained/pot/Ch-Stonith.pot index f54021d26a..9a896c3ee8 100644 --- a/doc/Pacemaker_Explained/pot/Ch-Stonith.pot +++ b/doc/Pacemaker_Explained/pot/Ch-Stonith.pot @@ -1,1272 +1,1297 @@ # # AUTHOR , YEAR. # msgid "" msgstr "" "Project-Id-Version: 0\n" "POT-Creation-Date: 2017-05-08 11:19-0500\n" "PO-Revision-Date: 2017-05-08 11:19-0500\n" "Last-Translator: Automatically generated\n" "Language-Team: None\n" "MIME-Version: 1.0\n" "Content-Type: application/x-publican; charset=UTF-8\n" "Content-Transfer-Encoding: 8bit\n" #. Tag: title #, no-c-format msgid "STONITH" msgstr "" #. Tag: para #, no-c-format msgid " STONITHConfiguration Configuration " msgstr "" #. Tag: title #, no-c-format msgid "What Is STONITH?" msgstr "" #. Tag: para #, no-c-format msgid "STONITH (an acronym for \"Shoot The Other Node In The Head\"), also called fencing, protects your data from being corrupted by rogue nodes or concurrent access." msgstr "" #. Tag: para #, no-c-format msgid "Just because a node is unresponsive doesn’t mean it isn’t accessing your data. The only way to be 100% sure that your data is safe is to use STONITH, so we can be certain that the node is truly offline before allowing the data to be accessed from another node." msgstr "" #. Tag: para #, no-c-format msgid "STONITH also has a role to play in the event that a clustered service cannot be stopped. In this case, the cluster uses STONITH to force the whole node offline, thereby making it safe to start the service elsewhere." msgstr "" #. Tag: title #, no-c-format msgid "What STONITH Device Should You Use?" msgstr "" #. Tag: para #, no-c-format msgid "It is crucial that the STONITH device allows the cluster to differentiate between a node failure and a network one." msgstr "" #. Tag: para #, no-c-format msgid "The biggest mistake people make in choosing a STONITH device is to use a remote power switch (such as many on-board IPMI controllers) that shares power with the node it controls. In such cases, the cluster cannot be sure if the node is really offline, or active and suffering from a network fault." msgstr "" #. Tag: para #, no-c-format msgid "Likewise, any device that relies on the machine being active (such as SSH-based \"devices\" used during testing) is inappropriate." msgstr "" #. Tag: title #, no-c-format msgid "Special Treatment of STONITH Resources" msgstr "" #. Tag: para #, no-c-format msgid "STONITH resources are somewhat special in Pacemaker." msgstr "" #. Tag: para #, no-c-format msgid "STONITH may be initiated by pacemaker or by other parts of the cluster (such as resources like DRBD or DLM). To accommodate this, pacemaker does not require the STONITH resource to be in the started state in order to be used, thus allowing reliable use of STONITH devices in such a case." msgstr "" #.
Tag: para #, no-c-format msgid "In pacemaker versions 1.1.9 and earlier, this feature either did not exist or did not work well. Only \"running\" STONITH resources could be used by Pacemaker for fencing, and if another component tried to fence a node while Pacemaker was moving STONITH resources, the fencing could fail." msgstr "" #. Tag: para #, no-c-format msgid "All nodes have access to STONITH devices' definitions and instantiate them on-the-fly when needed, but preference is given to verified instances, which are the ones that are started according to the cluster’s knowledge." msgstr "" #. Tag: para #, no-c-format msgid "In the case of a cluster split, the partition with a verified instance will have a slight advantage, because the STONITH daemon in the other partition will have to hear from all its current peers before choosing a node to perform the fencing." msgstr "" #. Tag: para #, no-c-format msgid "Fencing resources do work the same as regular resources in some respects:" msgstr "" #. Tag: para #, no-c-format msgid "target-role can be used to enable or disable the resource" msgstr "" #. Tag: para #, no-c-format msgid "Location constraints can be used to prevent a specific node from using the resource" msgstr "" #. Tag: para #, no-c-format msgid "Currently there is a limitation that fencing resources may only have one set of meta-attributes and one set of instance attributes. This can be revisited if it becomes a significant limitation for people." msgstr "" #. Tag: para #, no-c-format msgid "See the table below or run man stonithd to see special instance attributes that may be set for any fencing resource, regardless of fence agent." msgstr "" #. Tag: title #, no-c-format msgid "Properties of Fencing Resources" msgstr "" #. Tag: entry #, no-c-format msgid "Field" msgstr "" #. Tag: entry #, no-c-format msgid "Type" msgstr "" #. Tag: entry #, no-c-format msgid "Default" msgstr "" #. Tag: entry #, no-c-format msgid "Description" msgstr "" #. Tag: para #, no-c-format msgid "stonith-timeout" msgstr "" #. Tag: para #, no-c-format msgid "NA" msgstr "" #. Tag: para #, no-c-format msgid "Older versions used this to override the default period to wait for a STONITH (reboot, on, off) action to complete for this device. It has been replaced by the pcmk_reboot_timeout and pcmk_off_timeout properties. stonith-timeoutFencing Fencing FencingPropertystonith-timeout Propertystonith-timeout stonith-timeout " msgstr "" #. Tag: para #, no-c-format msgid "priority" msgstr "" #. Tag: para #, no-c-format msgid "integer" msgstr "" #. Tag: para #, no-c-format msgid "0" msgstr "" #. Tag: para #, no-c-format msgid "The priority of the STONITH resource. Devices are tried in order of highest priority to lowest. priorityFencing Fencing FencingPropertypriority Propertypriority priority " msgstr "" #. Tag: para #, no-c-format msgid "pcmk_host_map" msgstr "" #. Tag: para #, no-c-format msgid "string" msgstr "" #. Tag: para #, no-c-format msgid "A mapping of host names to ports numbers for devices that do not support host names. Example: node1:1;node2:2,3 tells the cluster to use port 1 for node1 and ports 2 and 3 for node2. pcmk_host_mapFencing Fencing FencingPropertypcmk_host_map Propertypcmk_host_map pcmk_host_map " msgstr "" #. Tag: para #, no-c-format msgid "pcmk_host_list" msgstr "" #. Tag: para #, no-c-format msgid "A list of machines controlled by this device (optional unless pcmk_host_check is static-list). 
pcmk_host_listFencing Fencing FencingPropertypcmk_host_list Propertypcmk_host_list pcmk_host_list " msgstr "" #. Tag: para #, no-c-format msgid "pcmk_host_check" msgstr "" #. Tag: para #, no-c-format msgid "dynamic-list" msgstr "" #. Tag: para #, no-c-format msgid "How to determine which machines are controlled by the device. Allowed values:" msgstr "" #. Tag: para #, no-c-format msgid "dynamic-list: query the device" msgstr "" #. Tag: para #, no-c-format msgid "static-list: check the pcmk_host_list attribute" msgstr "" #. Tag: para #, no-c-format msgid "none: assume every device can fence every machine" msgstr "" #. Tag: para #, no-c-format msgid " pcmk_host_checkFencing Fencing FencingPropertypcmk_host_check Propertypcmk_host_check pcmk_host_check " msgstr "" #. Tag: para #, no-c-format msgid "pcmk_delay_max" msgstr "" #. Tag: para #, no-c-format msgid "time" msgstr "" #. Tag: para #, no-c-format msgid "0s" msgstr "" #. Tag: para #, no-c-format -msgid "Enable a random delay of up to the time specified before executing stonith actions. This is sometimes used in two-node clusters to ensure that the nodes don’t fence each other at the same time." +msgid "Enable a random delay of up to the time specified before executing stonith actions. This is sometimes used in two-node clusters to ensure that the nodes don’t fence each other at the same time. The overall delay is derived by adding a static delay to this random delay value, keeping the sum below the maximum delay." msgstr "" #. Tag: para #, no-c-format msgid " pcmk_delay_maxFencing Fencing FencingPropertypcmk_delay_max Propertypcmk_delay_max pcmk_delay_max " msgstr "" +#. Tag: para +#, no-c-format +msgid "pcmk_delay_base" +msgstr "" + +#. Tag: para +#, no-c-format +msgid "time" +msgstr "" + +#. Tag: para +#, no-c-format +msgid "0s" +msgstr "" + +#. Tag: para +#, no-c-format +msgid "Enable a static base delay for stonith actions and specify its value. This prevents double fencing when different delays are configured on the nodes. The overall delay is derived by adding this static delay to a random delay value, keeping the sum below the maximum delay." +msgstr "" + +#. Tag: para +#, no-c-format +msgid " pcmk_delay_baseFencing Fencing FencingPropertypcmk_delay_base Propertypcmk_delay_base pcmk_delay_base " +msgstr "" + #. Tag: para #, no-c-format msgid "pcmk_action_limit" msgstr "" #. Tag: para #, no-c-format msgid "1" msgstr "" #. Tag: para #, no-c-format msgid "The maximum number of actions that can be performed in parallel on this device, if the cluster option concurrent-fencing is true. -1 is unlimited. (since 1.1.15)" msgstr "" #. Tag: para #, no-c-format msgid " pcmk_action_limitFencing Fencing FencingPropertypcmk_action_limit Propertypcmk_action_limit pcmk_action_limit " msgstr "" #. Tag: para #, no-c-format msgid "pcmk_host_argument" msgstr "" #. Tag: para #, no-c-format msgid "port" msgstr "" #. Tag: para #, no-c-format msgid "Advanced use only. Which parameter should be supplied to the resource agent to identify the node to be fenced. Some devices do not support the standard port parameter or may provide additional ones. Use this to specify an alternate, device-specific parameter. A value of none tells the cluster not to supply any additional parameters. pcmk_host_argumentFencing Fencing FencingPropertypcmk_host_argument Propertypcmk_host_argument pcmk_host_argument " msgstr "" #. Tag: para #, no-c-format msgid "pcmk_reboot_action" msgstr "" #. Tag: para #, no-c-format msgid "reboot" msgstr "" #.
Tag: para #, no-c-format msgid "Advanced use only. The command to send to the resource agent in order to reboot a node. Some devices do not support the standard commands or may provide additional ones. Use this to specify an alternate, device-specific command. pcmk_reboot_actionFencing Fencing FencingPropertypcmk_reboot_action Propertypcmk_reboot_action pcmk_reboot_action " msgstr "" #. Tag: para #, no-c-format msgid "pcmk_reboot_timeout" msgstr "" #. Tag: para #, no-c-format msgid "60s" msgstr "" #. Tag: para #, no-c-format msgid "Advanced use only. Specify an alternate timeout to use for reboot actions instead of the value of stonith-timeout. Some devices need much more or less time to complete than normal. Use this to specify an alternate, device-specific timeout. pcmk_reboot_timeoutFencing Fencing FencingPropertypcmk_reboot_timeout Propertypcmk_reboot_timeout pcmk_reboot_timeout stonith-timeoutFencing Fencing FencingPropertystonith-timeout Propertystonith-timeout stonith-timeout " msgstr "" #. Tag: para #, no-c-format msgid "pcmk_reboot_retries" msgstr "" #. Tag: para #, no-c-format msgid "2" msgstr "" #. Tag: para #, no-c-format msgid "Advanced use only. The maximum number of times to retry the reboot command within the timeout period. Some devices do not support multiple connections, and operations may fail if the device is busy with another task, so Pacemaker will automatically retry the operation, if there is time remaining. Use this option to alter the number of times Pacemaker retries before giving up. pcmk_reboot_retriesFencing Fencing FencingPropertypcmk_reboot_retries Propertypcmk_reboot_retries pcmk_reboot_retries " msgstr "" #. Tag: para #, no-c-format msgid "pcmk_off_action" msgstr "" #. Tag: para #, no-c-format msgid "off" msgstr "" #. Tag: para #, no-c-format msgid "Advanced use only. The command to send to the resource agent in order to shut down a node. Some devices do not support the standard commands or may provide additional ones. Use this to specify an alternate, device-specific command. pcmk_off_actionFencing Fencing FencingPropertypcmk_off_action Propertypcmk_off_action pcmk_off_action " msgstr "" #. Tag: para #, no-c-format msgid "pcmk_off_timeout" msgstr "" #. Tag: para #, no-c-format msgid "Advanced use only. Specify an alternate timeout to use for off actions instead of the value of stonith-timeout. Some devices need much more or less time to complete than normal. Use this to specify an alternate, device-specific timeout. pcmk_off_timeoutFencing Fencing FencingPropertypcmk_off_timeout Propertypcmk_off_timeout pcmk_off_timeout stonith-timeoutFencing Fencing FencingPropertystonith-timeout Propertystonith-timeout stonith-timeout " msgstr "" #. Tag: para #, no-c-format msgid "pcmk_off_retries" msgstr "" #. Tag: para #, no-c-format msgid "Advanced use only. The maximum number of times to retry the off command within the timeout period. Some devices do not support multiple connections, and operations may fail if the device is busy with another task, so Pacemaker will automatically retry the operation, if there is time remaining. Use this option to alter the number of times Pacemaker retries before giving up. pcmk_off_retriesFencing Fencing FencingPropertypcmk_off_retries Propertypcmk_off_retries pcmk_off_retries " msgstr "" #. Tag: para #, no-c-format msgid "pcmk_list_action" msgstr "" #. Tag: para #, no-c-format msgid "list" msgstr "" #. Tag: para #, no-c-format msgid "Advanced use only. The command to send to the resource agent in order to list nodes. 
Some devices do not support the standard commands or may provide additional ones. Use this to specify an alternate, device-specific command. pcmk_list_actionFencing Fencing FencingPropertypcmk_list_action Propertypcmk_list_action pcmk_list_action " msgstr "" #. Tag: para #, no-c-format msgid "pcmk_list_timeout" msgstr "" #. Tag: para #, no-c-format msgid "Advanced use only. Specify an alternate timeout to use for list actions instead of the value of stonith-timeout. Some devices need much more or less time to complete than normal. Use this to specify an alternate, device-specific timeout. pcmk_list_timeoutFencing Fencing FencingPropertypcmk_list_timeout Propertypcmk_list_timeout pcmk_list_timeout " msgstr "" #. Tag: para #, no-c-format msgid "pcmk_list_retries" msgstr "" #. Tag: para #, no-c-format msgid "Advanced use only. The maximum number of times to retry the list command within the timeout period. Some devices do not support multiple connections, and operations may fail if the device is busy with another task, so Pacemaker will automatically retry the operation, if there is time remaining. Use this option to alter the number of times Pacemaker retries before giving up. pcmk_list_retriesFencing Fencing FencingPropertypcmk_list_retries Propertypcmk_list_retries pcmk_list_retries " msgstr "" #. Tag: para #, no-c-format msgid "pcmk_monitor_action" msgstr "" #. Tag: para #, no-c-format msgid "monitor" msgstr "" #. Tag: para #, no-c-format msgid "Advanced use only. The command to send to the resource agent in order to report extended status. Some devices do not support the standard commands or may provide additional ones. Use this to specify an alternate, device-specific command. pcmk_monitor_actionFencing Fencing FencingPropertypcmk_monitor_action Propertypcmk_monitor_action pcmk_monitor_action " msgstr "" #. Tag: para #, no-c-format msgid "pcmk_monitor_timeout" msgstr "" #. Tag: para #, no-c-format msgid "Advanced use only. Specify an alternate timeout to use for monitor actions instead of the value of stonith-timeout. Some devices need much more or less time to complete than normal. Use this to specify an alternate, device-specific timeout. pcmk_monitor_timeoutFencing Fencing FencingPropertypcmk_monitor_timeout Propertypcmk_monitor_timeout pcmk_monitor_timeout " msgstr "" #. Tag: para #, no-c-format msgid "pcmk_monitor_retries" msgstr "" #. Tag: para #, no-c-format msgid "Advanced use only. The maximum number of times to retry the monitor command within the timeout period. Some devices do not support multiple connections, and operations may fail if the device is busy with another task, so Pacemaker will automatically retry the operation, if there is time remaining. Use this option to alter the number of times Pacemaker retries before giving up. pcmk_monitor_retriesFencing Fencing FencingPropertypcmk_monitor_retries Propertypcmk_monitor_retries pcmk_monitor_retries " msgstr "" #. Tag: para #, no-c-format msgid "pcmk_status_action" msgstr "" #. Tag: para #, no-c-format msgid "status" msgstr "" #. Tag: para #, no-c-format msgid "Advanced use only. The command to send to the resource agent in order to report status. Some devices do not support the standard commands or may provide additional ones. Use this to specify an alternate, device-specific command. pcmk_status_actionFencing Fencing FencingPropertypcmk_status_action Propertypcmk_status_action pcmk_status_action " msgstr "" #. Tag: para #, no-c-format msgid "pcmk_status_timeout" msgstr "" #. 
Tag: para #, no-c-format msgid "Advanced use only. Specify an alternate timeout to use for status actions instead of the value of stonith-timeout. Some devices need much more or less time to complete than normal. Use this to specify an alternate, device-specific timeout. pcmk_status_timeoutFencing Fencing FencingPropertypcmk_status_timeout Propertypcmk_status_timeout pcmk_status_timeout " msgstr "" #. Tag: para #, no-c-format msgid "pcmk_status_retries" msgstr "" #. Tag: para #, no-c-format msgid "Advanced use only. The maximum number of times to retry the status command within the timeout period. Some devices do not support multiple connections, and operations may fail if the device is busy with another task, so Pacemaker will automatically retry the operation, if there is time remaining. Use this option to alter the number of times Pacemaker retries before giving up. pcmk_status_retriesFencing Fencing FencingPropertypcmk_status_retries Propertypcmk_status_retries pcmk_status_retries " msgstr "" #. Tag: title #, no-c-format msgid "Configuring STONITH" msgstr "" #. Tag: para #, no-c-format msgid "Higher-level configuration shells include functionality to simplify the process below, particularly the step for deciding which parameters are required. However since this document deals only with core components, you should refer to the STONITH section of the Clusters from Scratch guide for those details." msgstr "" #. Tag: para #, no-c-format msgid "Find the correct driver:" msgstr "" #. Tag: screen #, no-c-format msgid "# stonith_admin --list-installed" msgstr "" #. Tag: para #, no-c-format msgid "Find the required parameters associated with the device (replacing $AGENT_NAME with the name obtained from the previous step):" msgstr "" #. Tag: screen #, no-c-format msgid "# stonith_admin --metadata --agent $AGENT_NAME" msgstr "" #. Tag: para #, no-c-format msgid "Create a file called stonith.xml containing a primitive resource with a class of stonith, a type equal to the agent name obtained earlier, and a parameter for each of the values returned in the previous step." msgstr "" #. Tag: para #, no-c-format msgid "If the device does not know how to fence nodes based on their uname, you may also need to set the special pcmk_host_map parameter. See man stonithd for details." msgstr "" #. Tag: para #, no-c-format msgid "If the device does not support the list command, you may also need to set the special pcmk_host_list and/or pcmk_host_check parameters. See man stonithd for details." msgstr "" #. Tag: para #, no-c-format msgid "If the device does not expect the victim to be specified with the port parameter, you may also need to set the special pcmk_host_argument parameter. See man stonithd for details." msgstr "" #. Tag: para #, no-c-format msgid "Upload it into the CIB using cibadmin:" msgstr "" #. Tag: screen #, no-c-format msgid "# cibadmin -C -o resources --xml-file stonith.xml" msgstr "" #. Tag: para #, no-c-format msgid "Set stonith-enabled to true:" msgstr "" #. Tag: screen #, no-c-format msgid "# crm_attribute -t crm_config -n stonith-enabled -v true" msgstr "" #. Tag: para #, no-c-format msgid "Once the stonith resource is running, you can test it by executing the following (although you might want to stop the cluster on that machine first):" msgstr "" #. Tag: screen #, no-c-format msgid "# stonith_admin --reboot nodename" msgstr "" #. Tag: title #, no-c-format msgid "Example STONITH Configuration" msgstr "" #. 
Tag: para #, no-c-format msgid "Assume we have an chassis containing four nodes and an IPMI device active on 192.0.2.1. We would choose the fence_ipmilan driver, and obtain the following list of parameters:" msgstr "" #. Tag: title #, no-c-format msgid "Obtaining a list of STONITH Parameters" msgstr "" #. Tag: screen #, no-c-format msgid "# stonith_admin --metadata -a fence_ipmilan" msgstr "" #. Tag: programlisting #, no-c-format msgid "<resource-agent name=\"fence_ipmilan\" shortdesc=\"Fence agent for IPMI over LAN\">\n" " <symlink name=\"fence_ilo3\" shortdesc=\"Fence agent for HP iLO3\"/>\n" " <symlink name=\"fence_ilo4\" shortdesc=\"Fence agent for HP iLO4\"/>\n" " <symlink name=\"fence_idrac\" shortdesc=\"Fence agent for Dell iDRAC\"/>\n" " <symlink name=\"fence_imm\" shortdesc=\"Fence agent for IBM Integrated Management Module\"/>\n" " <longdesc>\n" " </longdesc>\n" " <vendor-url>\n" " </vendor-url>\n" " <parameters>\n" " <parameter name=\"auth\" unique=\"0\" required=\"0\">\n" " <getopt mixed=\"-A\"/>\n" " <content type=\"string\"/>\n" " <shortdesc>\n" " </shortdesc>\n" " </parameter>\n" " <parameter name=\"ipaddr\" unique=\"0\" required=\"1\">\n" " <getopt mixed=\"-a\"/>\n" " <content type=\"string\"/>\n" " <shortdesc>\n" " </shortdesc>\n" " </parameter>\n" " <parameter name=\"passwd\" unique=\"0\" required=\"0\">\n" " <getopt mixed=\"-p\"/>\n" " <content type=\"string\"/>\n" " <shortdesc>\n" " </shortdesc>\n" " </parameter>\n" " <parameter name=\"passwd_script\" unique=\"0\" required=\"0\">\n" " <getopt mixed=\"-S\"/>\n" " <content type=\"string\"/>\n" " <shortdesc>\n" " </shortdesc>\n" " </parameter>\n" " <parameter name=\"lanplus\" unique=\"0\" required=\"0\">\n" " <getopt mixed=\"-P\"/>\n" " <content type=\"boolean\"/>\n" " <shortdesc>\n" " </shortdesc>\n" " </parameter>\n" " <parameter name=\"login\" unique=\"0\" required=\"0\">\n" " <getopt mixed=\"-l\"/>\n" " <content type=\"string\"/>\n" " <shortdesc>\n" " </shortdesc>\n" " </parameter>\n" " <parameter name=\"action\" unique=\"0\" required=\"0\">\n" " <getopt mixed=\"-o\"/>\n" " <content type=\"string\" default=\"reboot\"/>\n" " <shortdesc>\n" " </shortdesc>\n" " </parameter>\n" " <parameter name=\"timeout\" unique=\"0\" required=\"0\">\n" " <getopt mixed=\"-t\"/>\n" " <content type=\"string\"/>\n" " <shortdesc>\n" " </shortdesc>\n" " </parameter>\n" " <parameter name=\"cipher\" unique=\"0\" required=\"0\">\n" " <getopt mixed=\"-C\"/>\n" " <content type=\"string\"/>\n" " <shortdesc>\n" " </shortdesc>\n" " </parameter>\n" " <parameter name=\"method\" unique=\"0\" required=\"0\">\n" " <getopt mixed=\"-M\"/>\n" " <content type=\"string\" default=\"onoff\"/>\n" " <shortdesc>\n" " </shortdesc>\n" " </parameter>\n" " <parameter name=\"power_wait\" unique=\"0\" required=\"0\">\n" " <getopt mixed=\"-T\"/>\n" " <content type=\"string\" default=\"2\"/>\n" " <shortdesc>\n" " </shortdesc>\n" " </parameter>\n" " <parameter name=\"delay\" unique=\"0\" required=\"0\">\n" " <getopt mixed=\"-f\"/>\n" " <content type=\"string\"/>\n" " <shortdesc>\n" " </shortdesc>\n" " </parameter>\n" " <parameter name=\"privlvl\" unique=\"0\" required=\"0\">\n" " <getopt mixed=\"-L\"/>\n" " <content type=\"string\"/>\n" " <shortdesc>\n" " </shortdesc>\n" " </parameter>\n" " <parameter name=\"verbose\" unique=\"0\" required=\"0\">\n" " <getopt mixed=\"-v\"/>\n" " <content type=\"boolean\"/>\n" " <shortdesc>\n" " </shortdesc>\n" " </parameter>\n" " </parameters>\n" " <actions>\n" " <action name=\"on\"/>\n" " <action name=\"off\"/>\n" " <action 
name=\"reboot\"/>\n" " <action name=\"status\"/>\n" " <action name=\"diag\"/>\n" " <action name=\"list\"/>\n" " <action name=\"monitor\"/>\n" " <action name=\"metadata\"/>\n" " <action name=\"stop\" timeout=\"20s\"/>\n" " <action name=\"start\" timeout=\"20s\"/>\n" " </actions>\n" "</resource-agent>" msgstr "" #. Tag: para #, no-c-format msgid "Based on that, we would create a STONITH resource fragment that might look like this:" msgstr "" #. Tag: title #, no-c-format msgid "An IPMI-based STONITH Resource" msgstr "" #. Tag: programlisting #, no-c-format msgid "<primitive id=\"Fencing\" class=\"stonith\" type=\"fence_ipmilan\" >\n" " <instance_attributes id=\"Fencing-params\" >\n" " <nvpair id=\"Fencing-passwd\" name=\"passwd\" value=\"testuser\" />\n" " <nvpair id=\"Fencing-login\" name=\"login\" value=\"abc123\" />\n" " <nvpair id=\"Fencing-ipaddr\" name=\"ipaddr\" value=\"192.0.2.1\" />\n" " <nvpair id=\"Fencing-pcmk_host_list\" name=\"pcmk_host_list\" value=\"pcmk-1 pcmk-2\" />\n" " </instance_attributes>\n" " <operations >\n" " <op id=\"Fencing-monitor-10m\" interval=\"10m\" name=\"monitor\" timeout=\"300s\" />\n" " </operations>\n" "</primitive>" msgstr "" #. Tag: para #, no-c-format msgid "Finally, we need to enable STONITH:" msgstr "" #. Tag: title #, no-c-format msgid "Advanced STONITH Configurations" msgstr "" #. Tag: para #, no-c-format msgid "Some people consider that having one fencing device is a single point of failure Not true, since a node or resource must fail before fencing even has a chance to; others prefer removing the node from the storage and network instead of turning it off." msgstr "" #. Tag: para #, no-c-format msgid "Whatever the reason, Pacemaker supports fencing nodes with multiple devices through a feature called fencing topologies." msgstr "" #. Tag: para #, no-c-format msgid "Simply create the individual devices as you normally would, then define one or more fencing-level entries in the fencing-topology section of the configuration." msgstr "" #. Tag: para #, no-c-format msgid "Each fencing level is attempted in order of ascending index. Allowed values are 1 through 9." msgstr "" #. Tag: para #, no-c-format msgid "If a device fails, processing terminates for the current level. No further devices in that level are exercised, and the next level is attempted instead." msgstr "" #. Tag: para #, no-c-format msgid "If the operation succeeds for all the listed devices in a level, the level is deemed to have passed." msgstr "" #. Tag: para #, no-c-format msgid "The operation is finished when a level has passed (success), or all levels have been attempted (failed)." msgstr "" #. Tag: para #, no-c-format msgid "If the operation failed, the next step is determined by the Policy Engine and/or crmd." msgstr "" #. Tag: para #, no-c-format msgid "Some possible uses of topologies include:" msgstr "" #. Tag: para #, no-c-format msgid "Try poison-pill and fail back to power" msgstr "" #. Tag: para #, no-c-format msgid "Try disk and network, and fall back to power if either fails" msgstr "" #. Tag: para #, no-c-format msgid "Initiate a kdump and then poweroff the node" msgstr "" #. Tag: title #, no-c-format msgid "Properties of Fencing Levels" msgstr "" #. Tag: para #, no-c-format msgid "id" msgstr "" #. Tag: para #, no-c-format msgid "A unique name for the level idfencing-level fencing-level Fencingfencing-levelid fencing-levelid id " msgstr "" #. Tag: para #, no-c-format msgid "target" msgstr "" #. 
Tag: para #, no-c-format msgid "The name of a single node to which this level applies targetfencing-level fencing-level Fencingfencing-leveltarget fencing-leveltarget target " msgstr "" #. Tag: para #, no-c-format msgid "target-pattern" msgstr "" #. Tag: para #, no-c-format msgid "A regular expression matching the names of nodes to which this level applies (since 1.1.14) target-patternfencing-level fencing-level Fencingfencing-leveltarget-pattern fencing-leveltarget-pattern target-pattern " msgstr "" #. Tag: para #, no-c-format msgid "target-attribute" msgstr "" #. Tag: para #, no-c-format msgid "The name of a node attribute that is set (to target-value) for nodes to which this level applies (since 1.1.14) target-attributefencing-level fencing-level Fencingfencing-leveltarget-attribute fencing-leveltarget-attribute target-attribute " msgstr "" #. Tag: para #, no-c-format msgid "target-value" msgstr "" #. Tag: para #, no-c-format msgid "The node attribute value (of target-attribute) that is set for nodes to which this level applies (since 1.1.14) target-attributefencing-level fencing-level Fencingfencing-leveltarget-attribute fencing-leveltarget-attribute target-attribute " msgstr "" #. Tag: para #, no-c-format msgid "index" msgstr "" #. Tag: para #, no-c-format msgid "The order in which to attempt the levels. Levels are attempted in ascending order until one succeeds. Valid values are 1 through 9. indexfencing-level fencing-level Fencingfencing-levelindex fencing-levelindex index " msgstr "" #. Tag: para #, no-c-format msgid "devices" msgstr "" #. Tag: para #, no-c-format msgid "A comma-separated list of devices that must all be tried for this level devicesfencing-level fencing-level Fencingfencing-leveldevices fencing-leveldevices devices " msgstr "" #. Tag: title #, no-c-format msgid "Fencing topology with different devices for different nodes" msgstr "" #. Tag: programlisting #, no-c-format msgid " <cib crm_feature_set=\"3.0.6\" validate-with=\"pacemaker-1.2\" admin_epoch=\"1\" epoch=\"0\" num_updates=\"0\">\n" " <configuration>\n" " ...\n" " <fencing-topology>\n" " <!-- For pcmk-1, try poison-pill and fail back to power -->\n" " <fencing-level id=\"f-p1.1\" target=\"pcmk-1\" index=\"1\" devices=\"poison-pill\"/>\n" " <fencing-level id=\"f-p1.2\" target=\"pcmk-1\" index=\"2\" devices=\"power\"/>\n" "\n" " <!-- For pcmk-2, try disk and network, and fail back to power -->\n" " <fencing-level id=\"f-p2.1\" target=\"pcmk-2\" index=\"1\" devices=\"disk,network\"/>\n" " <fencing-level id=\"f-p2.2\" target=\"pcmk-2\" index=\"2\" devices=\"power\"/>\n" " </fencing-topology>\n" " ...\n" " <configuration>\n" " <status/>\n" "</cib>" msgstr "" #. Tag: title #, no-c-format msgid "Example Dual-Layer, Dual-Device Fencing Topologies" msgstr "" #. Tag: para #, no-c-format msgid "The following example illustrates an advanced use of fencing-topology in a cluster with the following properties:" msgstr "" #. Tag: para #, no-c-format msgid "3 nodes (2 active prod-mysql nodes, 1 prod_mysql-rep in standby for quorum purposes)" msgstr "" #. Tag: para #, no-c-format msgid "the active nodes have an IPMI-controlled power board reached at 192.0.2.1 and 192.0.2.2" msgstr "" #. Tag: para #, no-c-format msgid "the active nodes also have two independent PSUs (Power Supply Units) connected to two independent PDUs (Power Distribution Units) reached at 198.51.100.1 (port 10 and port 11) and 203.0.113.1 (port 10 and port 11)" msgstr "" #. 
Tag: para #, no-c-format msgid "the first fencing method uses the fence_ipmi agent" msgstr "" #. Tag: para #, no-c-format msgid "the second fencing method uses the fence_apc_snmp agent targetting 2 fencing devices (one per PSU, either port 10 or 11)" msgstr "" #. Tag: para #, no-c-format msgid "fencing is only implemented for the active nodes and has location constraints" msgstr "" #. Tag: para #, no-c-format msgid "fencing topology is set to try IPMI fencing first then default to a \"sure-kill\" dual PDU fencing" msgstr "" #. Tag: para #, no-c-format msgid "In a normal failure scenario, STONITH will first select fence_ipmi to try to kill the faulty node. Using a fencing topology, if that first method fails, STONITH will then move on to selecting fence_apc_snmp twice:" msgstr "" #. Tag: para #, no-c-format msgid "once for the first PDU" msgstr "" #. Tag: para #, no-c-format msgid "again for the second PDU" msgstr "" #. Tag: para #, no-c-format msgid "The fence action is considered successful only if both PDUs report the required status. If any of them fails, STONITH loops back to the first fencing method, fence_ipmi, and so on until the node is fenced or fencing action is cancelled." msgstr "" #. Tag: title #, no-c-format msgid "First fencing method: single IPMI device" msgstr "" #. Tag: para #, no-c-format msgid "Each cluster node has it own dedicated IPMI channel that can be called for fencing using the following primitives:" msgstr "" #. Tag: programlisting #, no-c-format msgid "<primitive class=\"stonith\" id=\"fence_prod-mysql1_ipmi\" type=\"fence_ipmilan\">\n" " <instance_attributes id=\"fence_prod-mysql1_ipmi-instance_attributes\">\n" " <nvpair id=\"fence_prod-mysql1_ipmi-instance_attributes-ipaddr\" name=\"ipaddr\" value=\"192.0.2.1\"/>\n" " <nvpair id=\"fence_prod-mysql1_ipmi-instance_attributes-action\" name=\"action\" value=\"off\"/>\n" " <nvpair id=\"fence_prod-mysql1_ipmi-instance_attributes-login\" name=\"login\" value=\"fencing\"/>\n" " <nvpair id=\"fence_prod-mysql1_ipmi-instance_attributes-passwd\" name=\"passwd\" value=\"finishme\"/>\n" " <nvpair id=\"fence_prod-mysql1_ipmi-instance_attributes-verbose\" name=\"verbose\" value=\"true\"/>\n" " <nvpair id=\"fence_prod-mysql1_ipmi-instance_attributes-pcmk_host_list\" name=\"pcmk_host_list\" value=\"prod-mysql1\"/>\n" " <nvpair id=\"fence_prod-mysql1_ipmi-instance_attributes-lanplus\" name=\"lanplus\" value=\"true\"/>\n" " </instance_attributes>\n" "</primitive>\n" "<primitive class=\"stonith\" id=\"fence_prod-mysql2_ipmi\" type=\"fence_ipmilan\">\n" " <instance_attributes id=\"fence_prod-mysql2_ipmi-instance_attributes\">\n" " <nvpair id=\"fence_prod-mysql2_ipmi-instance_attributes-ipaddr\" name=\"ipaddr\" value=\"192.0.2.2\"/>\n" " <nvpair id=\"fence_prod-mysql2_ipmi-instance_attributes-action\" name=\"action\" value=\"off\"/>\n" " <nvpair id=\"fence_prod-mysql2_ipmi-instance_attributes-login\" name=\"login\" value=\"fencing\"/>\n" " <nvpair id=\"fence_prod-mysql2_ipmi-instance_attributes-passwd\" name=\"passwd\" value=\"finishme\"/>\n" " <nvpair id=\"fence_prod-mysql2_ipmi-instance_attributes-verbose\" name=\"verbose\" value=\"true\"/>\n" " <nvpair id=\"fence_prod-mysql2_ipmi-instance_attributes-pcmk_host_list\" name=\"pcmk_host_list\" value=\"prod-mysql2\"/>\n" " <nvpair id=\"fence_prod-mysql2_ipmi-instance_attributes-lanplus\" name=\"lanplus\" value=\"true\"/>\n" " </instance_attributes>\n" "</primitive>" msgstr "" #. Tag: title #, no-c-format msgid "Second fencing method: dual PDU devices" msgstr "" #. 
Tag: para #, no-c-format msgid "Each cluster node also has two distinct power channels controlled by two distinct PDUs. That means a total of 4 fencing devices configured as follows:" msgstr "" #. Tag: para #, no-c-format msgid "Node 1, PDU 1, PSU 1 @ port 10" msgstr "" #. Tag: para #, no-c-format msgid "Node 1, PDU 2, PSU 2 @ port 10" msgstr "" #. Tag: para #, no-c-format msgid "Node 2, PDU 1, PSU 1 @ port 11" msgstr "" #. Tag: para #, no-c-format msgid "Node 2, PDU 2, PSU 2 @ port 11" msgstr "" #. Tag: para #, no-c-format msgid "The matching fencing agents are configured as follows:" msgstr "" #. Tag: programlisting #, no-c-format msgid "<primitive class=\"stonith\" id=\"fence_prod-mysql1_apc1\" type=\"fence_apc_snmp\">\n" " <instance_attributes id=\"fence_prod-mysql1_apc1-instance_attributes\">\n" " <nvpair id=\"fence_prod-mysql1_apc1-instance_attributes-ipaddr\" name=\"ipaddr\" value=\"198.51.100.1\"/>\n" " <nvpair id=\"fence_prod-mysql1_apc1-instance_attributes-action\" name=\"action\" value=\"off\"/>\n" " <nvpair id=\"fence_prod-mysql1_apc1-instance_attributes-port\" name=\"port\" value=\"10\"/>\n" " <nvpair id=\"fence_prod-mysql1_apc1-instance_attributes-login\" name=\"login\" value=\"fencing\"/>\n" " <nvpair id=\"fence_prod-mysql1_apc1-instance_attributes-passwd\" name=\"passwd\" value=\"fencing\"/>\n" " <nvpair id=\"fence_prod-mysql1_apc1-instance_attributes-pcmk_host_list\" name=\"pcmk_host_list\" value=\"prod-mysql1\"/>\n" " </instance_attributes>\n" "</primitive>\n" "<primitive class=\"stonith\" id=\"fence_prod-mysql1_apc2\" type=\"fence_apc_snmp\">\n" " <instance_attributes id=\"fence_prod-mysql1_apc2-instance_attributes\">\n" " <nvpair id=\"fence_prod-mysql1_apc2-instance_attributes-ipaddr\" name=\"ipaddr\" value=\"203.0.113.1\"/>\n" " <nvpair id=\"fence_prod-mysql1_apc2-instance_attributes-action\" name=\"action\" value=\"off\"/>\n" " <nvpair id=\"fence_prod-mysql1_apc2-instance_attributes-port\" name=\"port\" value=\"10\"/>\n" " <nvpair id=\"fence_prod-mysql1_apc2-instance_attributes-login\" name=\"login\" value=\"fencing\"/>\n" " <nvpair id=\"fence_prod-mysql1_apc2-instance_attributes-passwd\" name=\"passwd\" value=\"fencing\"/>\n" " <nvpair id=\"fence_prod-mysql1_apc2-instance_attributes-pcmk_host_list\" name=\"pcmk_host_list\" value=\"prod-mysql1\"/>\n" " </instance_attributes>\n" "</primitive>\n" "<primitive class=\"stonith\" id=\"fence_prod-mysql2_apc1\" type=\"fence_apc_snmp\">\n" " <instance_attributes id=\"fence_prod-mysql2_apc1-instance_attributes\">\n" " <nvpair id=\"fence_prod-mysql2_apc1-instance_attributes-ipaddr\" name=\"ipaddr\" value=\"198.51.100.1\"/>\n" " <nvpair id=\"fence_prod-mysql2_apc1-instance_attributes-action\" name=\"action\" value=\"off\"/>\n" " <nvpair id=\"fence_prod-mysql2_apc1-instance_attributes-port\" name=\"port\" value=\"11\"/>\n" " <nvpair id=\"fence_prod-mysql2_apc1-instance_attributes-login\" name=\"login\" value=\"fencing\"/>\n" " <nvpair id=\"fence_prod-mysql2_apc1-instance_attributes-passwd\" name=\"passwd\" value=\"fencing\"/>\n" " <nvpair id=\"fence_prod-mysql2_apc1-instance_attributes-pcmk_host_list\" name=\"pcmk_host_list\" value=\"prod-mysql2\"/>\n" " </instance_attributes>\n" "</primitive>\n" "<primitive class=\"stonith\" id=\"fence_prod-mysql2_apc2\" type=\"fence_apc_snmp\">\n" " <instance_attributes id=\"fence_prod-mysql2_apc2-instance_attributes\">\n" " <nvpair id=\"fence_prod-mysql2_apc2-instance_attributes-ipaddr\" name=\"ipaddr\" value=\"203.0.113.1\"/>\n" " <nvpair 
id=\"fence_prod-mysql2_apc2-instance_attributes-action\" name=\"action\" value=\"off\"/>\n" " <nvpair id=\"fence_prod-mysql2_apc2-instance_attributes-port\" name=\"port\" value=\"11\"/>\n" " <nvpair id=\"fence_prod-mysql2_apc2-instance_attributes-login\" name=\"login\" value=\"fencing\"/>\n" " <nvpair id=\"fence_prod-mysql2_apc2-instance_attributes-passwd\" name=\"passwd\" value=\"fencing\"/>\n" " <nvpair id=\"fence_prod-mysql2_apc2-instance_attributes-pcmk_host_list\" name=\"pcmk_host_list\" value=\"prod-mysql2\"/>\n" " </instance_attributes>\n" "</primitive>" msgstr "" #. Tag: title #, no-c-format msgid "Location Constraints" msgstr "" #. Tag: para #, no-c-format msgid "To prevent STONITH from trying to run a fencing agent on the same node it is supposed to fence, constraints are placed on all the fencing primitives:" msgstr "" #. Tag: programlisting #, no-c-format msgid "<constraints>\n" " <rsc_location id=\"l_fence_prod-mysql1_ipmi\" node=\"prod-mysql1\" rsc=\"fence_prod-mysql1_ipmi\" score=\"-INFINITY\"/>\n" " <rsc_location id=\"l_fence_prod-mysql2_ipmi\" node=\"prod-mysql2\" rsc=\"fence_prod-mysql2_ipmi\" score=\"-INFINITY\"/>\n" " <rsc_location id=\"l_fence_prod-mysql1_apc2\" node=\"prod-mysql1\" rsc=\"fence_prod-mysql1_apc2\" score=\"-INFINITY\"/>\n" " <rsc_location id=\"l_fence_prod-mysql1_apc1\" node=\"prod-mysql1\" rsc=\"fence_prod-mysql1_apc1\" score=\"-INFINITY\"/>\n" " <rsc_location id=\"l_fence_prod-mysql2_apc1\" node=\"prod-mysql2\" rsc=\"fence_prod-mysql2_apc1\" score=\"-INFINITY\"/>\n" " <rsc_location id=\"l_fence_prod-mysql2_apc2\" node=\"prod-mysql2\" rsc=\"fence_prod-mysql2_apc2\" score=\"-INFINITY\"/>\n" "</constraints>" msgstr "" #. Tag: title #, no-c-format msgid "Fencing topology" msgstr "" #. Tag: para #, no-c-format msgid "Now that all the fencing resources are defined, it’s time to create the right topology. We want to first fence using IPMI and if that does not work, fence both PDUs to effectively and surely kill the node." msgstr "" #. Tag: programlisting #, no-c-format msgid "<fencing-topology>\n" " <fencing-level devices=\"fence_prod-mysql1_ipmi\" id=\"fencing-2\" index=\"1\" target=\"prod-mysql1\"/>\n" " <fencing-level devices=\"fence_prod-mysql1_apc1,fence_prod-mysql1_apc2\" id=\"fencing-3\" index=\"2\" target=\"prod-mysql1\"/>\n" " <fencing-level devices=\"fence_prod-mysql2_ipmi\" id=\"fencing-0\" index=\"1\" target=\"prod-mysql2\"/>\n" " <fencing-level devices=\"fence_prod-mysql2_apc1,fence_prod-mysql2_apc2\" id=\"fencing-1\" index=\"2\" target=\"prod-mysql2\"/>\n" "</fencing-topology>" msgstr "" #. Tag: para #, no-c-format msgid "Please note, in fencing-topology, the lowest index value determines the priority of the first fencing method." msgstr "" #. Tag: title #, no-c-format msgid "Final configuration" msgstr "" #. Tag: para #, no-c-format msgid "Put together, the configuration looks like this:" msgstr "" #. 
Tag: programlisting #, no-c-format msgid "<cib admin_epoch=\"0\" crm_feature_set=\"3.0.7\" epoch=\"292\" have-quorum=\"1\" num_updates=\"29\" validate-with=\"pacemaker-1.2\">\n" " <configuration>\n" " <crm_config>\n" " <cluster_property_set id=\"cib-bootstrap-options\">\n" " <nvpair id=\"cib-bootstrap-options-stonith-enabled\" name=\"stonith-enabled\" value=\"true\"/>\n" " <nvpair id=\"cib-bootstrap-options-stonith-action\" name=\"stonith-action\" value=\"off\"/>\n" " <nvpair id=\"cib-bootstrap-options-expected-quorum-votes\" name=\"expected-quorum-votes\" value=\"3\"/>\n" " ...\n" " </cluster_property_set>\n" " </crm_config>\n" " <nodes>\n" " <node id=\"prod-mysql1\" uname=\"prod-mysql1\">\n" " <node id=\"prod-mysql2\" uname=\"prod-mysql2\"/>\n" " <node id=\"prod-mysql-rep1\" uname=\"prod-mysql-rep1\"/>\n" " <instance_attributes id=\"prod-mysql-rep1\">\n" " <nvpair id=\"prod-mysql-rep1-standby\" name=\"standby\" value=\"on\"/>\n" " </instance_attributes>\n" " </node>\n" " </nodes>\n" " <resources>\n" " <primitive class=\"stonith\" id=\"fence_prod-mysql1_ipmi\" type=\"fence_ipmilan\">\n" " <instance_attributes id=\"fence_prod-mysql1_ipmi-instance_attributes\">\n" " <nvpair id=\"fence_prod-mysql1_ipmi-instance_attributes-ipaddr\" name=\"ipaddr\" value=\"192.0.2.1\"/>\n" " <nvpair id=\"fence_prod-mysql1_ipmi-instance_attributes-action\" name=\"action\" value=\"off\"/>\n" " <nvpair id=\"fence_prod-mysql1_ipmi-instance_attributes-login\" name=\"login\" value=\"fencing\"/>\n" " <nvpair id=\"fence_prod-mysql1_ipmi-instance_attributes-passwd\" name=\"passwd\" value=\"finishme\"/>\n" " <nvpair id=\"fence_prod-mysql1_ipmi-instance_attributes-verbose\" name=\"verbose\" value=\"true\"/>\n" " <nvpair id=\"fence_prod-mysql1_ipmi-instance_attributes-pcmk_host_list\" name=\"pcmk_host_list\" value=\"prod-mysql1\"/>\n" " <nvpair id=\"fence_prod-mysql1_ipmi-instance_attributes-lanplus\" name=\"lanplus\" value=\"true\"/>\n" " </instance_attributes>\n" " </primitive>\n" " <primitive class=\"stonith\" id=\"fence_prod-mysql2_ipmi\" type=\"fence_ipmilan\">\n" " <instance_attributes id=\"fence_prod-mysql2_ipmi-instance_attributes\">\n" " <nvpair id=\"fence_prod-mysql2_ipmi-instance_attributes-ipaddr\" name=\"ipaddr\" value=\"192.0.2.2\"/>\n" " <nvpair id=\"fence_prod-mysql2_ipmi-instance_attributes-action\" name=\"action\" value=\"off\"/>\n" " <nvpair id=\"fence_prod-mysql2_ipmi-instance_attributes-login\" name=\"login\" value=\"fencing\"/>\n" " <nvpair id=\"fence_prod-mysql2_ipmi-instance_attributes-passwd\" name=\"passwd\" value=\"finishme\"/>\n" " <nvpair id=\"fence_prod-mysql2_ipmi-instance_attributes-verbose\" name=\"verbose\" value=\"true\"/>\n" " <nvpair id=\"fence_prod-mysql2_ipmi-instance_attributes-pcmk_host_list\" name=\"pcmk_host_list\" value=\"prod-mysql2\"/>\n" " <nvpair id=\"fence_prod-mysql2_ipmi-instance_attributes-lanplus\" name=\"lanplus\" value=\"true\"/>\n" " </instance_attributes>\n" " </primitive>\n" " <primitive class=\"stonith\" id=\"fence_prod-mysql1_apc1\" type=\"fence_apc_snmp\">\n" " <instance_attributes id=\"fence_prod-mysql1_apc1-instance_attributes\">\n" " <nvpair id=\"fence_prod-mysql1_apc1-instance_attributes-ipaddr\" name=\"ipaddr\" value=\"198.51.100.1\"/>\n" " <nvpair id=\"fence_prod-mysql1_apc1-instance_attributes-action\" name=\"action\" value=\"off\"/>\n" " <nvpair id=\"fence_prod-mysql1_apc1-instance_attributes-port\" name=\"port\" value=\"10\"/>\n" " <nvpair id=\"fence_prod-mysql1_apc1-instance_attributes-login\" name=\"login\" value=\"fencing\"/>\n" " <nvpair 
id=\"fence_prod-mysql1_apc1-instance_attributes-passwd\" name=\"passwd\" value=\"fencing\"/>\n" " <nvpair id=\"fence_prod-mysql1_apc1-instance_attributes-pcmk_host_list\" name=\"pcmk_host_list\" value=\"prod-mysql1\"/>\n" " </instance_attributes>\n" " </primitive>\n" " <primitive class=\"stonith\" id=\"fence_prod-mysql1_apc2\" type=\"fence_apc_snmp\">\n" " <instance_attributes id=\"fence_prod-mysql1_apc2-instance_attributes\">\n" " <nvpair id=\"fence_prod-mysql1_apc2-instance_attributes-ipaddr\" name=\"ipaddr\" value=\"203.0.113.1\"/>\n" " <nvpair id=\"fence_prod-mysql1_apc2-instance_attributes-action\" name=\"action\" value=\"off\"/>\n" " <nvpair id=\"fence_prod-mysql1_apc2-instance_attributes-port\" name=\"port\" value=\"10\"/>\n" " <nvpair id=\"fence_prod-mysql1_apc2-instance_attributes-login\" name=\"login\" value=\"fencing\"/>\n" " <nvpair id=\"fence_prod-mysql1_apc2-instance_attributes-passwd\" name=\"passwd\" value=\"fencing\"/>\n" " <nvpair id=\"fence_prod-mysql1_apc2-instance_attributes-pcmk_host_list\" name=\"pcmk_host_list\" value=\"prod-mysql1\"/>\n" " </instance_attributes>\n" " </primitive>\n" " <primitive class=\"stonith\" id=\"fence_prod-mysql2_apc1\" type=\"fence_apc_snmp\">\n" " <instance_attributes id=\"fence_prod-mysql2_apc1-instance_attributes\">\n" " <nvpair id=\"fence_prod-mysql2_apc1-instance_attributes-ipaddr\" name=\"ipaddr\" value=\"198.51.100.1\"/>\n" " <nvpair id=\"fence_prod-mysql2_apc1-instance_attributes-action\" name=\"action\" value=\"off\"/>\n" " <nvpair id=\"fence_prod-mysql2_apc1-instance_attributes-port\" name=\"port\" value=\"11\"/>\n" " <nvpair id=\"fence_prod-mysql2_apc1-instance_attributes-login\" name=\"login\" value=\"fencing\"/>\n" " <nvpair id=\"fence_prod-mysql2_apc1-instance_attributes-passwd\" name=\"passwd\" value=\"fencing\"/>\n" " <nvpair id=\"fence_prod-mysql2_apc1-instance_attributes-pcmk_host_list\" name=\"pcmk_host_list\" value=\"prod-mysql2\"/>\n" " </instance_attributes>\n" " </primitive>\n" " <primitive class=\"stonith\" id=\"fence_prod-mysql2_apc2\" type=\"fence_apc_snmp\">\n" " <instance_attributes id=\"fence_prod-mysql2_apc2-instance_attributes\">\n" " <nvpair id=\"fence_prod-mysql2_apc2-instance_attributes-ipaddr\" name=\"ipaddr\" value=\"203.0.113.1\"/>\n" " <nvpair id=\"fence_prod-mysql2_apc2-instance_attributes-action\" name=\"action\" value=\"off\"/>\n" " <nvpair id=\"fence_prod-mysql2_apc2-instance_attributes-port\" name=\"port\" value=\"11\"/>\n" " <nvpair id=\"fence_prod-mysql2_apc2-instance_attributes-login\" name=\"login\" value=\"fencing\"/>\n" " <nvpair id=\"fence_prod-mysql2_apc2-instance_attributes-passwd\" name=\"passwd\" value=\"fencing\"/>\n" " <nvpair id=\"fence_prod-mysql2_apc2-instance_attributes-pcmk_host_list\" name=\"pcmk_host_list\" value=\"prod-mysql2\"/>\n" " </instance_attributes>\n" " </primitive>\n" " </resources>\n" " <constraints>\n" " <rsc_location id=\"l_fence_prod-mysql1_ipmi\" node=\"prod-mysql1\" rsc=\"fence_prod-mysql1_ipmi\" score=\"-INFINITY\"/>\n" " <rsc_location id=\"l_fence_prod-mysql2_ipmi\" node=\"prod-mysql2\" rsc=\"fence_prod-mysql2_ipmi\" score=\"-INFINITY\"/>\n" " <rsc_location id=\"l_fence_prod-mysql1_apc2\" node=\"prod-mysql1\" rsc=\"fence_prod-mysql1_apc2\" score=\"-INFINITY\"/>\n" " <rsc_location id=\"l_fence_prod-mysql1_apc1\" node=\"prod-mysql1\" rsc=\"fence_prod-mysql1_apc1\" score=\"-INFINITY\"/>\n" " <rsc_location id=\"l_fence_prod-mysql2_apc1\" node=\"prod-mysql2\" rsc=\"fence_prod-mysql2_apc1\" score=\"-INFINITY\"/>\n" " <rsc_location id=\"l_fence_prod-mysql2_apc2\" 
node=\"prod-mysql2\" rsc=\"fence_prod-mysql2_apc2\" score=\"-INFINITY\"/>\n" " </constraints>\n" " <fencing-topology>\n" " <fencing-level devices=\"fence_prod-mysql1_ipmi\" id=\"fencing-2\" index=\"1\" target=\"prod-mysql1\"/>\n" " <fencing-level devices=\"fence_prod-mysql1_apc1,fence_prod-mysql1_apc2\" id=\"fencing-3\" index=\"2\" target=\"prod-mysql1\"/>\n" " <fencing-level devices=\"fence_prod-mysql2_ipmi\" id=\"fencing-0\" index=\"1\" target=\"prod-mysql2\"/>\n" " <fencing-level devices=\"fence_prod-mysql2_apc1,fence_prod-mysql2_apc2\" id=\"fencing-1\" index=\"2\" target=\"prod-mysql2\"/>\n" " </fencing-topology>\n" " ...\n" " </configuration>\n" "</cib>" msgstr "" #. Tag: title #, no-c-format msgid "Remapping Reboots" msgstr "" #. Tag: para #, no-c-format msgid "When the cluster needs to reboot a node, whether because stonith-action is reboot or because a reboot was manually requested (such as by stonith_admin --reboot), it will remap that to other commands in two cases:" msgstr "" #. Tag: para #, no-c-format msgid "If the chosen fencing device does not support the reboot command, the cluster will ask it to perform off instead." msgstr "" #. Tag: para #, no-c-format msgid "If a fencing topology level with multiple devices must be executed, the cluster will ask all the devices to perform off, then ask the devices to perform on." msgstr "" #. Tag: para #, no-c-format msgid "To understand the second case, consider the example of a node with redundant power supplies connected to intelligent power switches. Rebooting one switch and then the other would have no effect on the node. Turning both switches off, and then on, actually reboots the node." msgstr "" #. Tag: para #, no-c-format msgid "In such a case, the fencing operation will be treated as successful as long as the off commands succeed, because then it is safe for the cluster to recover any resources that were on the node. Timeouts and errors in the on phase will be logged but ignored." msgstr "" #. Tag: para #, no-c-format msgid "When a reboot operation is remapped, any action-specific timeout for the remapped action will be used (for example, pcmk_off_timeout will be used when executing the off command, not pcmk_reboot_timeout)." msgstr "" #. Tag: para #, no-c-format msgid "In Pacemaker versions 1.1.13 and earlier, reboots will not be remapped in the second case. To achieve the same effect, separate fencing devices for off and on actions must be configured." msgstr "" diff --git a/fencing/main.c b/fencing/main.c index 1622362fab..360bc25d10 100644 --- a/fencing/main.c +++ b/fencing/main.c @@ -1,1558 +1,1558 @@ /* * Copyright (C) 2009 Andrew Beekhof * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public * License as published by the Free Software Foundation; either * version 2 of the License, or (at your option) any later version. * * This software is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * General Public License for more details. 
* * You should have received a copy of the GNU General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include char *stonith_our_uname = NULL; char *stonith_our_uuid = NULL; long stonith_watchdog_timeout_ms = 0; GMainLoop *mainloop = NULL; gboolean stand_alone = FALSE; gboolean no_cib_connect = FALSE; gboolean stonith_shutdown_flag = FALSE; qb_ipcs_service_t *ipcs = NULL; xmlNode *local_cib = NULL; GHashTable *known_peer_names = NULL; static cib_t *cib_api = NULL; static void *cib_library = NULL; static void stonith_shutdown(int nsig); static void stonith_cleanup(void); static int32_t st_ipc_accept(qb_ipcs_connection_t * c, uid_t uid, gid_t gid) { if (stonith_shutdown_flag) { crm_info("Ignoring new client [%d] during shutdown", crm_ipcs_client_pid(c)); return -EPERM; } if (crm_client_new(c, uid, gid) == NULL) { return -EIO; } return 0; } static void st_ipc_created(qb_ipcs_connection_t * c) { crm_trace("Connection created for %p", c); } /* Exit code means? */ static int32_t st_ipc_dispatch(qb_ipcs_connection_t * qbc, void *data, size_t size) { uint32_t id = 0; uint32_t flags = 0; int call_options = 0; xmlNode *request = NULL; crm_client_t *c = crm_client_get(qbc); const char *op = NULL; if (c == NULL) { crm_info("Invalid client: %p", qbc); return 0; } request = crm_ipcs_recv(c, data, size, &id, &flags); if (request == NULL) { crm_ipcs_send_ack(c, id, flags, "nack", __FUNCTION__, __LINE__); return 0; } op = crm_element_value(request, F_CRM_TASK); if(safe_str_eq(op, CRM_OP_RM_NODE_CACHE)) { crm_xml_add(request, F_TYPE, T_STONITH_NG); crm_xml_add(request, F_STONITH_OPERATION, op); crm_xml_add(request, F_STONITH_CLIENTID, c->id); crm_xml_add(request, F_STONITH_CLIENTNAME, crm_client_name(c)); crm_xml_add(request, F_STONITH_CLIENTNODE, stonith_our_uname); send_cluster_message(NULL, crm_msg_stonith_ng, request, FALSE); free_xml(request); return 0; } if (c->name == NULL) { const char *value = crm_element_value(request, F_STONITH_CLIENTNAME); if (value == NULL) { value = "unknown"; } c->name = crm_strdup_printf("%s.%u", value, c->pid); } crm_element_value_int(request, F_STONITH_CALLOPTS, &call_options); crm_trace("Flags %u/%u for command %u from %s", flags, call_options, id, crm_client_name(c)); if (is_set(call_options, st_opt_sync_call)) { CRM_ASSERT(flags & crm_ipc_client_response); CRM_LOG_ASSERT(c->request_id == 0); /* This means the client has two synchronous events in-flight */ c->request_id = id; /* Reply only to the last one */ } crm_xml_add(request, F_STONITH_CLIENTID, c->id); crm_xml_add(request, F_STONITH_CLIENTNAME, crm_client_name(c)); crm_xml_add(request, F_STONITH_CLIENTNODE, stonith_our_uname); crm_log_xml_trace(request, "Client[inbound]"); stonith_command(c, id, flags, request, NULL); free_xml(request); return 0; } /* Error code means? 
 */
static int32_t
st_ipc_closed(qb_ipcs_connection_t * c)
{
    crm_client_t *client = crm_client_get(c);

    if (client == NULL) {
        return 0;
    }
    crm_trace("Connection %p closed", c);
    crm_client_destroy(client);

    /* 0 means: yes, go ahead and destroy the connection */
    return 0;
}

static void
st_ipc_destroy(qb_ipcs_connection_t * c)
{
    crm_trace("Connection %p destroyed", c);
    st_ipc_closed(c);
}

static void
stonith_peer_callback(xmlNode * msg, void *private_data)
{
    const char *remote_peer = crm_element_value(msg, F_ORIG);
    const char *op = crm_element_value(msg, F_STONITH_OPERATION);

    if (crm_str_eq(op, "poke", TRUE)) {
        return;
    }

    crm_log_xml_trace(msg, "Peer[inbound]");
    stonith_command(NULL, 0, 0, msg, remote_peer);
}

#if SUPPORT_HEARTBEAT
static void
stonith_peer_hb_callback(HA_Message * msg, void *private_data)
{
    xmlNode *xml = convert_ha_message(NULL, msg, __FUNCTION__);

    stonith_peer_callback(xml, private_data);
    free_xml(xml);
}

static void
stonith_peer_hb_destroy(gpointer user_data)
{
    if (stonith_shutdown_flag) {
        crm_info("Heartbeat disconnection complete... exiting");
    } else {
        crm_err("Heartbeat connection lost! Exiting.");
    }
    stonith_shutdown(0);
}
#endif

#if SUPPORT_COROSYNC
static void
stonith_peer_ais_callback(cpg_handle_t handle,
                          const struct cpg_name *groupName,
                          uint32_t nodeid, uint32_t pid, void *msg, size_t msg_len)
{
    uint32_t kind = 0;
    xmlNode *xml = NULL;
    const char *from = NULL;
    char *data = pcmk_message_common_cs(handle, nodeid, pid, msg, &kind, &from);

    if(data == NULL) {
        return;
    }
    if (kind == crm_class_cluster) {
        xml = string2xml(data);
        if (xml == NULL) {
            crm_err("Invalid XML: '%.120s'", data);
            free(data);
            return;
        }
        crm_xml_add(xml, F_ORIG, from);
        /* crm_xml_add_int(xml, F_SEQ, wrapper->id); */
        stonith_peer_callback(xml, NULL);
    }

    free_xml(xml);
    free(data);
    return;
}

static void
stonith_peer_cs_destroy(gpointer user_data)
{
    crm_err("Corosync connection terminated");
    stonith_shutdown(0);
}
#endif

void
do_local_reply(xmlNode * notify_src, const char *client_id, gboolean sync_reply, gboolean from_peer)
{
    /* send callback to originating child */
    crm_client_t *client_obj = NULL;
    int local_rc = pcmk_ok;

    crm_trace("Sending response");
    client_obj = crm_client_get_by_id(client_id);

    crm_trace("Sending callback to request originator");
    if (client_obj == NULL) {
        local_rc = -1;
        crm_trace("No client to send the response to. F_STONITH_CLIENTID not set.");

    } else {
        int rid = 0;

        if (sync_reply) {
            CRM_LOG_ASSERT(client_obj->request_id);
            rid = client_obj->request_id;
            client_obj->request_id = 0;

            crm_trace("Sending response %d to %s %s",
                      rid, client_obj->name, from_peer ? "(originator of delegated request)" : "");
        } else {
            crm_trace("Sending an event to %s %s",
                      client_obj->name, from_peer ? "(originator of delegated request)" : "");
        }

        local_rc = crm_ipcs_send(client_obj, rid, notify_src,
                                 sync_reply ? crm_ipc_flags_none : crm_ipc_server_event);
    }

    if (local_rc < pcmk_ok && client_obj != NULL) {
        crm_warn("%sSync reply to %s failed: %s",
                 sync_reply ? "" : "A-", client_obj ?
client_obj->name : "", pcmk_strerror(local_rc)); } } long long get_stonith_flag(const char *name) { if (safe_str_eq(name, T_STONITH_NOTIFY_FENCE)) { return 0x01; } else if (safe_str_eq(name, STONITH_OP_DEVICE_ADD)) { return 0x04; } else if (safe_str_eq(name, STONITH_OP_DEVICE_DEL)) { return 0x10; } return 0; } static void stonith_notify_client(gpointer key, gpointer value, gpointer user_data) { xmlNode *update_msg = user_data; crm_client_t *client = value; const char *type = NULL; CRM_CHECK(client != NULL, return); CRM_CHECK(update_msg != NULL, return); type = crm_element_value(update_msg, F_SUBTYPE); CRM_CHECK(type != NULL, crm_log_xml_err(update_msg, "notify"); return); if (client->ipcs == NULL) { crm_trace("Skipping client with NULL channel"); return; } if (client->options & get_stonith_flag(type)) { int rc = crm_ipcs_send(client, 0, update_msg, crm_ipc_server_event | crm_ipc_server_error); if (rc <= 0) { crm_warn("%s notification of client %s.%.6s failed: %s (%d)", type, crm_client_name(client), client->id, pcmk_strerror(rc), rc); } else { crm_trace("Sent %s notification to client %s.%.6s", type, crm_client_name(client), client->id); } } } void do_stonith_async_timeout_update(const char *client_id, const char *call_id, int timeout) { crm_client_t *client = NULL; xmlNode *notify_data = NULL; if (!timeout || !call_id || !client_id) { return; } client = crm_client_get_by_id(client_id); if (!client) { return; } notify_data = create_xml_node(NULL, T_STONITH_TIMEOUT_VALUE); crm_xml_add(notify_data, F_TYPE, T_STONITH_TIMEOUT_VALUE); crm_xml_add(notify_data, F_STONITH_CALLID, call_id); crm_xml_add_int(notify_data, F_STONITH_TIMEOUT, timeout); crm_trace("timeout update is %d for client %s and call id %s", timeout, client_id, call_id); if (client) { crm_ipcs_send(client, 0, notify_data, crm_ipc_server_event); } free_xml(notify_data); } void do_stonith_notify(int options, const char *type, int result, xmlNode * data) { /* TODO: Standardize the contents of data */ xmlNode *update_msg = create_xml_node(NULL, "notify"); CRM_CHECK(type != NULL,;); crm_xml_add(update_msg, F_TYPE, T_STONITH_NOTIFY); crm_xml_add(update_msg, F_SUBTYPE, type); crm_xml_add(update_msg, F_STONITH_OPERATION, type); crm_xml_add_int(update_msg, F_STONITH_RC, result); if (data != NULL) { add_message_xml(update_msg, F_STONITH_CALLDATA, data); } crm_trace("Notifying clients"); g_hash_table_foreach(client_connections, stonith_notify_client, update_msg); free_xml(update_msg); crm_trace("Notify complete"); } static void do_stonith_notify_config(int options, const char *op, int rc, const char *desc, int active) { xmlNode *notify_data = create_xml_node(NULL, op); CRM_CHECK(notify_data != NULL, return); crm_xml_add(notify_data, F_STONITH_DEVICE, desc); crm_xml_add_int(notify_data, F_STONITH_ACTIVE, active); do_stonith_notify(options, op, rc, notify_data); free_xml(notify_data); } void do_stonith_notify_device(int options, const char *op, int rc, const char *desc) { do_stonith_notify_config(options, op, rc, desc, g_hash_table_size(device_list)); } void do_stonith_notify_level(int options, const char *op, int rc, const char *desc) { do_stonith_notify_config(options, op, rc, desc, g_hash_table_size(topology)); } static void topology_remove_helper(const char *node, int level) { int rc; char *desc = NULL; xmlNode *data = create_xml_node(NULL, XML_TAG_FENCING_LEVEL); crm_xml_add(data, F_STONITH_ORIGIN, __FUNCTION__); crm_xml_add_int(data, XML_ATTR_STONITH_INDEX, level); crm_xml_add(data, XML_ATTR_STONITH_TARGET, node); rc = 
stonith_level_remove(data, &desc); do_stonith_notify_level(0, STONITH_OP_LEVEL_DEL, rc, desc); free_xml(data); free(desc); } static void remove_cib_device(xmlXPathObjectPtr xpathObj) { int max = numXpathResults(xpathObj), lpc = 0; for (lpc = 0; lpc < max; lpc++) { const char *rsc_id = NULL; const char *standard = NULL; xmlNode *match = getXpathResult(xpathObj, lpc); CRM_LOG_ASSERT(match != NULL); if(match != NULL) { standard = crm_element_value(match, XML_AGENT_ATTR_CLASS); } if (safe_str_neq(standard, PCMK_RESOURCE_CLASS_STONITH)) { continue; } rsc_id = crm_element_value(match, XML_ATTR_ID); stonith_device_remove(rsc_id, TRUE); } } static void handle_topology_change(xmlNode *match, bool remove) { int rc; char *desc = NULL; CRM_CHECK(match != NULL, return); crm_trace("Updating %s", ID(match)); if(remove) { int index = 0; char *key = stonith_level_key(match, -1); crm_element_value_int(match, XML_ATTR_STONITH_INDEX, &index); topology_remove_helper(key, index); free(key); } rc = stonith_level_register(match, &desc); do_stonith_notify_level(0, STONITH_OP_LEVEL_ADD, rc, desc); free(desc); } static void remove_fencing_topology(xmlXPathObjectPtr xpathObj) { int max = numXpathResults(xpathObj), lpc = 0; for (lpc = 0; lpc < max; lpc++) { xmlNode *match = getXpathResult(xpathObj, lpc); CRM_LOG_ASSERT(match != NULL); if (match && crm_element_value(match, XML_DIFF_MARKER)) { /* Deletion */ int index = 0; char *target = stonith_level_key(match, -1); crm_element_value_int(match, XML_ATTR_STONITH_INDEX, &index); if (target == NULL) { crm_err("Invalid fencing target in element %s", ID(match)); } else if (index <= 0) { crm_err("Invalid level for %s in element %s", target, ID(match)); } else { topology_remove_helper(target, index); } /* } else { Deal with modifications during the 'addition' stage */ } } } static void register_fencing_topology(xmlXPathObjectPtr xpathObj) { int max = numXpathResults(xpathObj), lpc = 0; for (lpc = 0; lpc < max; lpc++) { xmlNode *match = getXpathResult(xpathObj, lpc); handle_topology_change(match, TRUE); } } /* Fencing */ static void fencing_topology_init() { xmlXPathObjectPtr xpathObj = NULL; const char *xpath = "//" XML_TAG_FENCING_LEVEL; crm_trace("Full topology refresh"); if(topology) { g_hash_table_destroy(topology); topology = g_hash_table_new_full(crm_str_hash, g_str_equal, NULL, free_topology_entry); } /* Grab everything */ xpathObj = xpath_search(local_cib, xpath); register_fencing_topology(xpathObj); freeXpathObject(xpathObj); } #define rsc_name(x) x->clone_name?x->clone_name:x->id /*! * \internal * \brief Check whether our uname is in a resource's allowed node list * * \param[in] rsc Resource to check * * \return Pointer to node object if found, NULL otherwise */ static node_t * our_node_allowed_for(resource_t *rsc) { GHashTableIter iter; node_t *node = NULL; if (rsc && stonith_our_uname) { g_hash_table_iter_init(&iter, rsc->allowed_nodes); while (g_hash_table_iter_next(&iter, NULL, (void **)&node)) { if (node && strcmp(node->details->uname, stonith_our_uname) == 0) { break; } node = NULL; } } return node; } /*! * \internal * \brief If a resource or any of its children are STONITH devices, update their * definitions given a cluster working set. 
* * \param[in] rsc Resource to check * \param[in] data_set Cluster working set with device information */ static void cib_device_update(resource_t *rsc, pe_working_set_t *data_set) { node_t *node = NULL; const char *value = NULL; const char *rclass = NULL; node_t *parent = NULL; gboolean remove = TRUE; /* If this is a complex resource, check children rather than this resource itself. * TODO: Mark each installed device and remove if untouched when this process finishes. */ if(rsc->children) { GListPtr gIter = NULL; for (gIter = rsc->children; gIter != NULL; gIter = gIter->next) { cib_device_update(gIter->data, data_set); if(pe_rsc_is_clone(rsc)) { crm_trace("Only processing one copy of the clone %s", rsc->id); break; } } return; } /* We only care about STONITH resources. */ rclass = crm_element_value(rsc->xml, XML_AGENT_ATTR_CLASS); if (safe_str_neq(rclass, PCMK_RESOURCE_CLASS_STONITH)) { return; } /* If this STONITH resource is disabled, just remove it. */ value = g_hash_table_lookup(rsc->meta, XML_RSC_ATTR_TARGET_ROLE); if (safe_str_eq(value, RSC_STOPPED)) { crm_info("Device %s has been disabled", rsc->id); goto update_done; } /* Check whether our node is allowed for this resource (and its parent if in a group) */ node = our_node_allowed_for(rsc); if (rsc->parent && (rsc->parent->variant == pe_group)) { parent = our_node_allowed_for(rsc->parent); } if(node == NULL) { /* Our node is disallowed, so remove the device */ GHashTableIter iter; crm_info("Device %s has been disabled on %s: unknown", rsc->id, stonith_our_uname); g_hash_table_iter_init(&iter, rsc->allowed_nodes); while (g_hash_table_iter_next(&iter, NULL, (void **)&node)) { crm_trace("Available: %s = %d", node->details->uname, node->weight); } goto update_done; } else if(node->weight < 0 || (parent && parent->weight < 0)) { /* Our node (or its group) is disallowed by score, so remove the device */ char *score = score2char((node->weight < 0) ? node->weight : parent->weight); crm_info("Device %s has been disabled on %s: score=%s", rsc->id, stonith_our_uname, score); free(score); goto update_done; } else { /* Our node is allowed, so update the device information */ xmlNode *data; GHashTableIter gIter; stonith_key_value_t *params = NULL; const char *name = NULL; const char *agent = crm_element_value(rsc->xml, XML_EXPR_ATTR_TYPE); const char *provider = crm_element_value(rsc->xml, XML_AGENT_ATTR_PROVIDER); const char *rsc_provides = NULL; crm_debug("Device %s is allowed on %s: score=%d", rsc->id, stonith_our_uname, node->weight); get_rsc_attributes(rsc->parameters, rsc, node, data_set); get_meta_attributes(rsc->meta, rsc, node, data_set); rsc_provides = g_hash_table_lookup(rsc->meta, XML_RSC_ATTR_PROVIDES); g_hash_table_iter_init(&gIter, rsc->parameters); while (g_hash_table_iter_next(&gIter, (gpointer *) & name, (gpointer *) & value)) { if (!name || !value) { continue; } params = stonith_key_value_add(params, name, value); crm_trace(" %s=%s", name, value); } remove = FALSE; data = create_device_registration_xml(rsc_name(rsc), provider, agent, params, rsc_provides); stonith_device_register(data, NULL, TRUE); stonith_key_value_freeall(params, 1, 1); free_xml(data); } update_done: if(remove && g_hash_table_lookup(device_list, rsc_name(rsc))) { stonith_device_remove(rsc_name(rsc), TRUE); } } extern xmlNode *do_calculations(pe_working_set_t * data_set, xmlNode * xml_input, crm_time_t * now); /*! 
* \internal * \brief Update all STONITH device definitions based on current CIB */ static void cib_devices_update(void) { GListPtr gIter = NULL; pe_working_set_t data_set; crm_info("Updating devices to version %s.%s.%s", crm_element_value(local_cib, XML_ATTR_GENERATION_ADMIN), crm_element_value(local_cib, XML_ATTR_GENERATION), crm_element_value(local_cib, XML_ATTR_NUMUPDATES)); set_working_set_defaults(&data_set); data_set.input = local_cib; data_set.now = crm_time_new(NULL); data_set.flags |= pe_flag_quick_location; data_set.localhost = stonith_our_uname; cluster_status(&data_set); do_calculations(&data_set, NULL, NULL); for (gIter = data_set.resources; gIter != NULL; gIter = gIter->next) { cib_device_update(gIter->data, &data_set); } data_set.input = NULL; /* Wasn't a copy */ cleanup_alloc_calculations(&data_set); } static void update_cib_stonith_devices_v2(const char *event, xmlNode * msg) { xmlNode *change = NULL; char *reason = NULL; bool needs_update = FALSE; xmlNode *patchset = get_message_xml(msg, F_CIB_UPDATE_RESULT); for (change = __xml_first_child(patchset); change != NULL; change = __xml_next(change)) { const char *op = crm_element_value(change, XML_DIFF_OP); const char *xpath = crm_element_value(change, XML_DIFF_PATH); const char *shortpath = NULL; if(op == NULL || strcmp(op, "move") == 0) { continue; } else if(safe_str_eq(op, "delete") && strstr(xpath, XML_CIB_TAG_RESOURCE)) { const char *rsc_id = NULL; char *search = NULL; char *mutable = NULL; if (strstr(xpath, XML_TAG_ATTR_SETS)) { needs_update = TRUE; break; } mutable = strdup(xpath); rsc_id = strstr(mutable, "primitive[@id=\'"); if (rsc_id != NULL) { rsc_id += strlen("primitive[@id=\'"); search = strchr(rsc_id, '\''); } if (search != NULL) { *search = 0; stonith_device_remove(rsc_id, TRUE); } else { crm_warn("Ignoring malformed CIB update (resource deletion)"); } free(mutable); } else if(strstr(xpath, "/"XML_CIB_TAG_RESOURCES)) { shortpath = strrchr(xpath, '/'); CRM_ASSERT(shortpath); reason = crm_strdup_printf("%s %s", op, shortpath+1); needs_update = TRUE; break; } else if(strstr(xpath, XML_CIB_TAG_CONSTRAINTS)) { shortpath = strrchr(xpath, '/'); CRM_ASSERT(shortpath); reason = crm_strdup_printf("%s %s", op, shortpath+1); needs_update = TRUE; break; } } if(needs_update) { crm_info("Updating device list from the cib: %s", reason); cib_devices_update(); } else { crm_trace("No updates for device list found in cib"); } free(reason); } static void update_cib_stonith_devices_v1(const char *event, xmlNode * msg) { const char *reason = "none"; gboolean needs_update = FALSE; xmlXPathObjectPtr xpath_obj = NULL; /* process new constraints */ xpath_obj = xpath_search(msg, "//" F_CIB_UPDATE_RESULT "//" XML_CONS_TAG_RSC_LOCATION); if (numXpathResults(xpath_obj) > 0) { int max = numXpathResults(xpath_obj), lpc = 0; /* Safest and simplest to always recompute */ needs_update = TRUE; reason = "new location constraint"; for (lpc = 0; lpc < max; lpc++) { xmlNode *match = getXpathResult(xpath_obj, lpc); crm_log_xml_trace(match, "new constraint"); } } freeXpathObject(xpath_obj); /* process deletions */ xpath_obj = xpath_search(msg, "//" F_CIB_UPDATE_RESULT "//" XML_TAG_DIFF_REMOVED "//" XML_CIB_TAG_RESOURCE); if (numXpathResults(xpath_obj) > 0) { remove_cib_device(xpath_obj); } freeXpathObject(xpath_obj); /* process additions */ xpath_obj = xpath_search(msg, "//" F_CIB_UPDATE_RESULT "//" XML_TAG_DIFF_ADDED "//" XML_CIB_TAG_RESOURCE); if (numXpathResults(xpath_obj) > 0) { int max = numXpathResults(xpath_obj), lpc = 0; for (lpc = 0; lpc < 
max; lpc++) { const char *rsc_id = NULL; const char *standard = NULL; xmlNode *match = getXpathResult(xpath_obj, lpc); rsc_id = crm_element_value(match, XML_ATTR_ID); standard = crm_element_value(match, XML_AGENT_ATTR_CLASS); if (safe_str_neq(standard, PCMK_RESOURCE_CLASS_STONITH)) { continue; } crm_trace("Fencing resource %s was added or modified", rsc_id); reason = "new resource"; needs_update = TRUE; } } freeXpathObject(xpath_obj); if(needs_update) { crm_info("Updating device list from the cib: %s", reason); cib_devices_update(); } } static void update_cib_stonith_devices(const char *event, xmlNode * msg) { int format = 1; xmlNode *patchset = get_message_xml(msg, F_CIB_UPDATE_RESULT); CRM_ASSERT(patchset); crm_element_value_int(patchset, "format", &format); switch(format) { case 1: update_cib_stonith_devices_v1(event, msg); break; case 2: update_cib_stonith_devices_v2(event, msg); break; default: crm_warn("Unknown patch format: %d", format); } } /* Needs to hold node name + attribute name + attribute value + 75 */ #define XPATH_MAX 512 /*! * \internal * \brief Check whether a node has a specific attribute name/value * * \param[in] node Name of node to check * \param[in] name Name of an attribute to look for * \param[in] value The value the named attribute needs to be set to in order to be considered a match * * \return TRUE if the locally cached CIB has the specified node attribute */ gboolean node_has_attr(const char *node, const char *name, const char *value) { char xpath[XPATH_MAX]; xmlNode *match; int n; CRM_CHECK(local_cib != NULL, return FALSE); /* Search for the node's attributes in the CIB. While the schema allows * multiple sets of instance attributes, and allows instance attributes to * use id-ref to reference values elsewhere, that is intended for resources, * so we ignore that here. 
*/ n = snprintf(xpath, XPATH_MAX, "//" XML_CIB_TAG_NODES "/" XML_CIB_TAG_NODE "[@uname='%s']/" XML_TAG_ATTR_SETS "/" XML_CIB_TAG_NVPAIR "[@name='%s' and @value='%s']", node, name, value); match = get_xpath_object(xpath, local_cib, LOG_TRACE); CRM_CHECK(n < XPATH_MAX, return FALSE); return (match != NULL); } static void update_fencing_topology(const char *event, xmlNode * msg) { int format = 1; const char *xpath; xmlXPathObjectPtr xpathObj = NULL; xmlNode *patchset = get_message_xml(msg, F_CIB_UPDATE_RESULT); CRM_ASSERT(patchset); crm_element_value_int(patchset, "format", &format); if(format == 1) { /* Process deletions (only) */ xpath = "//" F_CIB_UPDATE_RESULT "//" XML_TAG_DIFF_REMOVED "//" XML_TAG_FENCING_LEVEL; xpathObj = xpath_search(msg, xpath); remove_fencing_topology(xpathObj); freeXpathObject(xpathObj); /* Process additions and changes */ xpath = "//" F_CIB_UPDATE_RESULT "//" XML_TAG_DIFF_ADDED "//" XML_TAG_FENCING_LEVEL; xpathObj = xpath_search(msg, xpath); register_fencing_topology(xpathObj); freeXpathObject(xpathObj); } else if(format == 2) { xmlNode *change = NULL; int add[] = { 0, 0, 0 }; int del[] = { 0, 0, 0 }; xml_patch_versions(patchset, add, del); for (change = __xml_first_child(patchset); change != NULL; change = __xml_next(change)) { const char *op = crm_element_value(change, XML_DIFF_OP); const char *xpath = crm_element_value(change, XML_DIFF_PATH); if(op == NULL) { continue; } else if(strstr(xpath, "/" XML_TAG_FENCING_LEVEL) != NULL) { /* Change to a specific entry */ crm_trace("Handling %s operation %d.%d.%d for %s", op, add[0], add[1], add[2], xpath); if(strcmp(op, "move") == 0) { continue; } else if(strcmp(op, "create") == 0) { handle_topology_change(change->children, FALSE); } else if(strcmp(op, "modify") == 0) { xmlNode *match = first_named_child(change, XML_DIFF_RESULT); if(match) { handle_topology_change(match->children, TRUE); } } else if(strcmp(op, "delete") == 0) { /* Nuclear option, all we have is the path and an id... 
not enough to remove a specific entry */
                    crm_info("Re-initializing fencing topology after %s operation %d.%d.%d for %s",
                             op, add[0], add[1], add[2], xpath);
                    fencing_topology_init();
                    return;
                }

            } else if (strstr(xpath, "/" XML_TAG_FENCING_TOPOLOGY) != NULL) {
                /* Change to the topology in general */
                crm_info("Re-initializing fencing topology after top-level %s operation %d.%d.%d for %s",
                         op, add[0], add[1], add[2], xpath);
                fencing_topology_init();
                return;

            } else if (strstr(xpath, "/" XML_CIB_TAG_CONFIGURATION)) {
                /* Changes to the whole config section, possibly including the topology as a whole */
                if(first_named_child(change, XML_TAG_FENCING_TOPOLOGY) == NULL) {
                    crm_trace("Nothing for us in %s operation %d.%d.%d for %s.",
                              op, add[0], add[1], add[2], xpath);

                } else if(strcmp(op, "delete") == 0 || strcmp(op, "create") == 0) {
                    crm_info("Re-initializing fencing topology after top-level %s operation %d.%d.%d for %s.",
                             op, add[0], add[1], add[2], xpath);
                    fencing_topology_init();
                    return;
                }

            } else {
                crm_trace("Nothing for us in %s operation %d.%d.%d for %s",
                          op, add[0], add[1], add[2], xpath);
            }
        }

    } else {
        crm_warn("Unknown patch format: %d", format);
    }
}

static bool have_cib_devices = FALSE;

static void
update_cib_cache_cb(const char *event, xmlNode * msg)
{
    int rc = pcmk_ok;
    xmlNode *stonith_enabled_xml = NULL;
    xmlNode *stonith_watchdog_xml = NULL;
    const char *stonith_enabled_s = NULL;
    static gboolean stonith_enabled_saved = TRUE;

    if(!have_cib_devices) {
        crm_trace("Skipping updates until we get a full dump");
        return;

    } else if(msg == NULL) {
        crm_trace("Missing %s update", event);
        return;
    }

    /* Maintain a local copy of the CIB so that we have full access
     * to device definitions, location constraints, and node attributes
     */
    if (local_cib != NULL) {
        int rc = pcmk_ok;
        xmlNode *patchset = NULL;

        crm_element_value_int(msg, F_CIB_RC, &rc);
        if (rc != pcmk_ok) {
            return;
        }

        patchset = get_message_xml(msg, F_CIB_UPDATE_RESULT);
        xml_log_patchset(LOG_TRACE, "Config update", patchset);
        rc = xml_apply_patchset(local_cib, patchset, TRUE);
        switch (rc) {
            case pcmk_ok:
            case -pcmk_err_old_data:
                break;
            case -pcmk_err_diff_resync:
            case -pcmk_err_diff_failed:
                crm_notice("[%s] Patch aborted: %s (%d)", event, pcmk_strerror(rc), rc);
                free_xml(local_cib);
                local_cib = NULL;
                break;
            default:
                crm_warn("[%s] ABORTED: %s (%d)", event, pcmk_strerror(rc), rc);
                free_xml(local_cib);
                local_cib = NULL;
        }
    }

    if (local_cib == NULL) {
        crm_trace("Re-requesting the full cib");
        rc = cib_api->cmds->query(cib_api, NULL, &local_cib, cib_scope_local | cib_sync_call);
        if(rc != pcmk_ok) {
            crm_err("Couldn't retrieve the CIB: %s (%d)", pcmk_strerror(rc), rc);
            return;
        }
        CRM_ASSERT(local_cib != NULL);
        stonith_enabled_saved = FALSE; /* Trigger a full refresh below */
    }

    stonith_enabled_xml = get_xpath_object("//nvpair[@name='stonith-enabled']", local_cib, LOG_TRACE);
    if (stonith_enabled_xml) {
        stonith_enabled_s = crm_element_value(stonith_enabled_xml, XML_NVPAIR_ATTR_VALUE);
    }

    if (stonith_enabled_s == NULL || crm_is_true(stonith_enabled_s)) {
        long timeout_ms = 0;
        const char *value = NULL;

        stonith_watchdog_xml = get_xpath_object("//nvpair[@name='stonith-watchdog-timeout']", local_cib, LOG_TRACE);
        if (stonith_watchdog_xml) {
            value = crm_element_value(stonith_watchdog_xml, XML_NVPAIR_ATTR_VALUE);
        }

        if(value) {
            timeout_ms = crm_get_msec(value);
        }

        if(timeout_ms != stonith_watchdog_timeout_ms) {
            crm_notice("New watchdog timeout %lds (was %lds)", timeout_ms/1000, stonith_watchdog_timeout_ms/1000);
            stonith_watchdog_timeout_ms = timeout_ms;
        }

    } else {
stonith_watchdog_timeout_ms = 0; } if (stonith_enabled_s && crm_is_true(stonith_enabled_s) == FALSE) { crm_trace("Ignoring cib updates while stonith is disabled"); stonith_enabled_saved = FALSE; return; } else if (stonith_enabled_saved == FALSE) { crm_info("Updating stonith device and topology lists now that stonith is enabled"); stonith_enabled_saved = TRUE; fencing_topology_init(); cib_devices_update(); } else { update_fencing_topology(event, msg); update_cib_stonith_devices(event, msg); } } static void init_cib_cache_cb(xmlNode * msg, int call_id, int rc, xmlNode * output, void *user_data) { crm_info("Updating device list from the cib: init"); have_cib_devices = TRUE; local_cib = copy_xml(output); fencing_topology_init(); cib_devices_update(); } static void stonith_shutdown(int nsig) { stonith_shutdown_flag = TRUE; crm_info("Terminating with %d clients", crm_hash_table_size(client_connections)); if (mainloop != NULL && g_main_is_running(mainloop)) { g_main_quit(mainloop); } else { stonith_cleanup(); crm_exit(pcmk_ok); } } static void cib_connection_destroy(gpointer user_data) { if (stonith_shutdown_flag) { crm_info("Connection to the CIB closed."); return; } else { crm_notice("Connection to the CIB terminated. Shutting down."); } if (cib_api) { cib_api->cmds->signoff(cib_api); } stonith_shutdown(0); } static void stonith_cleanup(void) { if (cib_api) { cib_api->cmds->signoff(cib_api); } if (ipcs) { qb_ipcs_destroy(ipcs); } g_hash_table_destroy(known_peer_names); known_peer_names = NULL; crm_peer_destroy(); crm_client_cleanup(); free(stonith_our_uname); free_xml(local_cib); } /* *INDENT-OFF* */ static struct crm_option long_options[] = { {"stand-alone", 0, 0, 's'}, {"stand-alone-w-cpg", 0, 0, 'c'}, {"logfile", 1, 0, 'l'}, {"verbose", 0, 0, 'V'}, {"version", 0, 0, '$'}, {"help", 0, 0, '?'}, {0, 0, 0, 0} }; /* *INDENT-ON* */ static void setup_cib(void) { int rc, retries = 0; static cib_t *(*cib_new_fn) (void) = NULL; if (cib_new_fn == NULL) { cib_new_fn = find_library_function(&cib_library, CIB_LIBRARY, "cib_new", TRUE); } if (cib_new_fn != NULL) { cib_api = (*cib_new_fn) (); } if (cib_api == NULL) { crm_err("No connection to the CIB"); return; } do { sleep(retries); rc = cib_api->cmds->signon(cib_api, CRM_SYSTEM_STONITHD, cib_command); } while (rc == -ENOTCONN && ++retries < 5); if (rc != pcmk_ok) { crm_err("Could not connect to the CIB service: %s (%d)", pcmk_strerror(rc), rc); } else if (pcmk_ok != cib_api->cmds->add_notify_callback(cib_api, T_CIB_DIFF_NOTIFY, update_cib_cache_cb)) { crm_err("Could not set CIB notification callback"); } else { rc = cib_api->cmds->query(cib_api, NULL, NULL, cib_scope_local); cib_api->cmds->register_callback(cib_api, rc, 120, FALSE, NULL, "init_cib_cache_cb", init_cib_cache_cb); cib_api->cmds->set_connection_dnotify(cib_api, cib_connection_destroy); crm_info("Watching for stonith topology changes"); } } struct qb_ipcs_service_handlers ipc_callbacks = { .connection_accept = st_ipc_accept, .connection_created = st_ipc_created, .msg_process = st_ipc_dispatch, .connection_closed = st_ipc_closed, .connection_destroyed = st_ipc_destroy }; /*! 
 *
 * \internal
 * \brief Callback for peer status changes
 *
 * \param[in] type What changed
 * \param[in] node What peer had the change
 * \param[in] data Previous value of what changed
 */
static void
st_peer_update_callback(enum crm_status_type type, crm_node_t * node, const void *data)
{
    if ((type != crm_status_processes) && !is_set(node->flags, crm_remote_node)) {
        xmlNode *query = NULL;

        if (node->id && node->uname) {
            g_hash_table_insert(known_peer_names, GUINT_TO_POINTER(node->id), strdup(node->uname));
        }

        /*
         * This is a hack until we can send to a nodeid and/or we fix node name lookups
         * These messages are ignored in stonith_peer_callback()
         */
        query = create_xml_node(NULL, "stonith_command");

        crm_xml_add(query, F_XML_TAGNAME, "stonith_command");
        crm_xml_add(query, F_TYPE, T_STONITH_NG);
        crm_xml_add(query, F_STONITH_OPERATION, "poke");

        crm_debug("Broadcasting our uname because of node %u", node->id);
        send_cluster_message(NULL, crm_msg_stonith_ng, query, FALSE);

        free_xml(query);
    }
}

int
main(int argc, char **argv)
{
    int flag;
    int rc = 0;
    int lpc = 0;
    int argerr = 0;
    int option_index = 0;
    crm_cluster_t cluster;
    const char *actions[] = { "reboot", "off", "list", "monitor", "status" };

    crm_log_preinit("stonith-ng", argc, argv);
    crm_set_options(NULL, "mode [options]", long_options,
                    "Daemon for executing fencing requests on behalf of the cluster and other components.\n");

    while (1) {
        flag = crm_get_option(argc, argv, &option_index);
        if (flag == -1) {
            break;
        }

        switch (flag) {
            case 'V':
                crm_bump_log_level(argc, argv);
                break;
            case 'l':
                crm_add_logfile(optarg);
                break;
            case 's':
                stand_alone = TRUE;
                break;
            case 'c':
                stand_alone = FALSE;
                no_cib_connect = TRUE;
                break;
            case '$':
            case '?':
                crm_help(flag, EX_OK);
                break;
            default:
                ++argerr;
                break;
        }
    }

    /* "stonithd metadata" prints pseudo-resource-agent metadata describing
     * the special instance attributes handled for all fencing resources */
    if (argc - optind == 1 && safe_str_eq("metadata", argv[optind])) {
        printf("\n");
        printf("\n");
        printf(" 1.0\n");
        printf(" This is a fake resource that details the instance attributes handled by stonithd.\n");
        printf(" Options available for all stonith resources\n");
        printf(" \n");
        printf(" \n");
        printf(" The priority of the stonith resource. Devices are tried in order of highest priority to lowest.\n");
        printf(" \n");
        printf(" \n");
        printf(" \n", STONITH_ATTR_HOSTARG);
        printf(" Advanced use only: An alternate parameter to supply instead of 'port'\n");
        printf(" Some devices do not support the standard 'port' parameter or may provide additional ones.\n"
               "Use this to specify an alternate, device-specific, parameter that should indicate the machine to be fenced.\n"
               "A value of 'none' can be used to tell the cluster not to supply any additional parameters.\n"
               " \n");
        printf(" \n");
        printf(" \n");
        printf(" \n", STONITH_ATTR_HOSTMAP);
        printf(" A mapping of host names to port numbers for devices that do not support host names.\n");
        printf(" E.g. node1:1;node2:2,3 would tell the cluster to use port 1 for node1 and ports 2 and 3 for node2\n");
        printf(" \n");
        printf(" \n", STONITH_ATTR_HOSTLIST);
        printf(" A list of machines controlled by this device (Optional unless %s=static-list).\n", STONITH_ATTR_HOSTCHECK);
        printf(" \n");
        printf(" \n");
        printf(" \n", STONITH_ATTR_HOSTCHECK);
        printf(" How to determine which machines are controlled by the device.\n");
        printf(" Allowed values: dynamic-list (query the device), static-list (check the %s attribute), none (assume every device can fence every machine)\n", STONITH_ATTR_HOSTLIST);
        printf(" \n");
        printf(" \n");
        printf(" \n", STONITH_ATTR_DELAY_MAX);
        printf
-            (" Enable random delay for stonith actions and specify the maximum of random delay\n");
+            (" Enable a random delay for stonith actions and specify the maximum of random delay.\n");
        printf
            (" This prevents double fencing when using slow devices such as sbd.\n"
-             "Use this to enable random delay for stonith actions.\n"
-             "The overall delay is derived from a random delay value adding a static delay so that the sum is kept below the maximum delay.\n");
+             "Use this to enable a random delay for stonith actions.\n"
+             "The overall delay is derived from this random delay value adding a static delay so that the sum is kept below the maximum delay.\n");
        printf(" \n");
        printf(" \n");
        printf(" \n", STONITH_ATTR_DELAY_BASE);
        printf
-            (" Enable base delay for stonith actions and specify base delay value\n");
+            (" Enable a base delay for stonith actions and specify base delay value.\n");
        printf
            (" This prevents double fencing when different delays are configured on the nodes.\n"
-             "Use this to enable static delay for stonith actions.\n"
-             "The overall delay is derived from a random delay value adding a static delay so that the sum is kept below the maximum delay.\n");
+             "Use this to enable a static delay for stonith actions.\n"
+             "The overall delay is derived from a random delay value adding this static delay so that the sum is kept below the maximum delay.\n");
        printf(" \n");
        printf(" \n");
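        /* Illustration only (hypothetical ids, not part of the metadata
         * output): the two delay attributes above are set as instance
         * attributes of a fencing resource, e.g.
         *
         *   <nvpair id="F-delay-base" name="pcmk_delay_base" value="1s"/>
         *   <nvpair id="F-delay-max"  name="pcmk_delay_max"  value="10s"/>
         *
         * The overall delay before a fence action is the static base delay
         * plus a random component, with the sum kept below the configured
         * maximum. */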
        printf(" \n", STONITH_ATTR_ACTION_LIMIT);
        printf(" The maximum number of actions that can be performed in parallel on this device\n");
        printf(" Pengine property concurrent-fencing=true needs to be configured first.\n"
               "Then use this to specify the maximum number of actions that can be performed in parallel on this device. -1 is unlimited.\n");
        printf(" \n");
        printf(" \n");

        for (lpc = 0; lpc < DIMOF(actions); lpc++) {
            printf(" \n", actions[lpc]);
            printf(" Advanced use only: An alternate command to run instead of '%s'\n", actions[lpc]);
            printf(" Some devices do not support the standard commands or may provide additional ones.\n"
                   "Use this to specify an alternate, device-specific, command that implements the '%s' action.\n", actions[lpc]);
            printf(" \n", actions[lpc]);
            printf(" \n");
            printf(" \n", actions[lpc]);
            printf(" Advanced use only: Specify an alternate timeout to use for %s actions instead of stonith-timeout\n", actions[lpc]);
            printf(" Some devices need much more/less time to complete than normal.\n"
                   "Use this to specify an alternate, device-specific, timeout for '%s' actions.\n", actions[lpc]);
            printf(" \n");
            printf(" \n", actions[lpc]);
            printf(" Advanced use only: The maximum number of times to retry the '%s' command within the timeout period\n", actions[lpc]);
            printf(" Some devices do not support multiple connections."
" Operations may 'fail' if the device is busy with another task so Pacemaker will automatically retry the operation, if there is time remaining." " Use this option to alter the number of times Pacemaker retries '%s' actions before giving up." "\n", actions[lpc]); printf(" \n"); printf(" \n"); } printf(" \n"); printf("\n"); return 0; } if (optind != argc) { ++argerr; } if (argerr) { crm_help('?', EX_USAGE); } crm_log_init("stonith-ng", LOG_INFO, TRUE, FALSE, argc, argv, FALSE); mainloop_add_signal(SIGTERM, stonith_shutdown); crm_peer_init(); known_peer_names = g_hash_table_new_full(g_direct_hash, g_direct_equal, NULL, free); if (stand_alone == FALSE) { #if SUPPORT_HEARTBEAT cluster.hb_conn = NULL; cluster.hb_dispatch = stonith_peer_hb_callback; cluster.destroy = stonith_peer_hb_destroy; #endif if (is_openais_cluster()) { #if SUPPORT_COROSYNC cluster.destroy = stonith_peer_cs_destroy; cluster.cpg.cpg_deliver_fn = stonith_peer_ais_callback; cluster.cpg.cpg_confchg_fn = pcmk_cpg_membership; #endif } crm_set_status_callback(&st_peer_update_callback); if (crm_cluster_connect(&cluster) == FALSE) { crm_crit("Cannot sign in to the cluster... terminating"); crm_exit(DAEMON_RESPAWN_STOP); } stonith_our_uname = cluster.uname; stonith_our_uuid = cluster.uuid; #if SUPPORT_HEARTBEAT if (is_heartbeat_cluster()) { /* crm_cluster_connect() registered us for crm_system_name, which * usually is the only F_TYPE used by the respective sub system. * Stonith needs to register two additional F_TYPE callbacks, * because it can :-/ */ if (HA_OK != cluster.hb_conn->llc_ops->set_msg_callback(cluster.hb_conn, T_STONITH_NOTIFY, cluster.hb_dispatch, cluster.hb_conn)) { crm_crit("Cannot set msg callback %s: %s", T_STONITH_NOTIFY, cluster.hb_conn->llc_ops->errmsg(cluster.hb_conn)); crm_exit(DAEMON_RESPAWN_STOP); } if (HA_OK != cluster.hb_conn->llc_ops->set_msg_callback(cluster.hb_conn, T_STONITH_TIMEOUT_VALUE, cluster.hb_dispatch, cluster.hb_conn)) { crm_crit("Cannot set msg callback %s: %s", T_STONITH_TIMEOUT_VALUE, cluster.hb_conn->llc_ops->errmsg(cluster.hb_conn)); crm_exit(DAEMON_RESPAWN_STOP); } } #endif if (no_cib_connect == FALSE) { setup_cib(); } } else { stonith_our_uname = strdup("localhost"); } device_list = g_hash_table_new_full(crm_str_hash, g_str_equal, NULL, free_device); topology = g_hash_table_new_full(crm_str_hash, g_str_equal, NULL, free_topology_entry); if(stonith_watchdog_timeout_ms > 0) { xmlNode *xml; stonith_key_value_t *params = NULL; params = stonith_key_value_add(params, STONITH_ATTR_HOSTLIST, stonith_our_uname); xml = create_device_registration_xml("watchdog", "internal", STONITH_WATCHDOG_AGENT, params, NULL); stonith_device_register(xml, NULL, FALSE); stonith_key_value_freeall(params, 1, 1); free_xml(xml); } stonith_ipc_server_init(&ipcs, &ipc_callbacks); #if SUPPORT_STONITH_CONFIG if (((stand_alone == TRUE)) && !(standalone_cfg_read_file(STONITH_NG_CONF_FILE))) { standalone_cfg_commit(); } #endif /* Create the mainloop and run it... */ mainloop = g_main_new(FALSE); crm_info("Starting %s mainloop", crm_system_name); g_main_run(mainloop); stonith_cleanup(); #if SUPPORT_HEARTBEAT if (cluster.hb_conn) { cluster.hb_conn->llc_ops->delete(cluster.hb_conn); } #endif crm_info("Done"); return crm_exit(rc); }