diff --git a/Doxyfile.in b/Doxyfile.in new file mode 100644 index 0000000000..81f21d6ddd --- /dev/null +++ b/Doxyfile.in @@ -0,0 +1,1720 @@ +# Doxyfile 1.7.4 + +# This file describes the settings to be used by the documentation system +# doxygen (www.doxygen.org) for a project. +# +# All text after a hash (#) is considered a comment and will be ignored. +# The format is: +# TAG = value [value, ...] +# For lists items can also be appended using: +# TAG += value [value, ...] +# Values that contain spaces should be placed between quotes (" "). + +#--------------------------------------------------------------------------- +# Project related configuration options +#--------------------------------------------------------------------------- + +# This tag specifies the encoding used for all characters in the config file +# that follow. The default is UTF-8 which is also the encoding used for all +# text before the first occurrence of this tag. Doxygen uses libiconv (or the +# iconv built into libc) for the transcoding. See +# http://www.gnu.org/software/libiconv for the list of possible encodings. + +DOXYFILE_ENCODING = UTF-8 + +# The PROJECT_NAME tag is a single word (or a sequence of words surrounded +# by quotes) that should identify the project. + +PROJECT_NAME = @PACKAGE_NAME@ + +# The PROJECT_NUMBER tag can be used to enter a project or revision number. +# This could be handy for archiving the generated documentation or +# if some version control system is used. + +PROJECT_NUMBER = @PACKAGE_VERSION@-@BUILD_VERSION@ + +# Using the PROJECT_BRIEF tag one can provide an optional one line description +# for a project that appears at the top of each page and should give viewer +# a quick idea about the purpose of the project. Keep the description short. + +PROJECT_BRIEF = "Scalable High-Availability cluster resource manager" + +# With the PROJECT_LOGO tag one can specify an logo or icon that is +# included in the documentation. The maximum height of the logo should not +# exceed 55 pixels and the maximum width should not exceed 200 pixels. +# Doxygen will copy the logo to the output directory. + +PROJECT_LOGO = doc/publican-clusterlabs/en-US/images/title_logo.png + +# The OUTPUT_DIRECTORY tag is used to specify the (relative or absolute) +# base path where the generated documentation will be put. +# If a relative path is entered, it will be relative to the location +# where doxygen was started. If left blank the current directory will be used. + +OUTPUT_DIRECTORY = doc/api/ + +# If the CREATE_SUBDIRS tag is set to YES, then doxygen will create +# 4096 sub-directories (in 2 levels) under the output directory of each output +# format and will distribute the generated files over these directories. +# Enabling this option can be useful when feeding doxygen a huge amount of +# source files, where putting all generated files in the same directory would +# otherwise cause performance problems for the file system. + +CREATE_SUBDIRS = NO + +# The OUTPUT_LANGUAGE tag is used to specify the language in which all +# documentation generated by doxygen is written. Doxygen will use this +# information to generate all constant output in the proper language. 
+# The default language is English, other supported languages are: +# Afrikaans, Arabic, Brazilian, Catalan, Chinese, Chinese-Traditional, +# Croatian, Czech, Danish, Dutch, Esperanto, Farsi, Finnish, French, German, +# Greek, Hungarian, Italian, Japanese, Japanese-en (Japanese with English +# messages), Korean, Korean-en, Lithuanian, Norwegian, Macedonian, Persian, +# Polish, Portuguese, Romanian, Russian, Serbian, Serbian-Cyrillic, Slovak, +# Slovene, Spanish, Swedish, Ukrainian, and Vietnamese. + +OUTPUT_LANGUAGE = English + +# If the BRIEF_MEMBER_DESC tag is set to YES (the default) Doxygen will +# include brief member descriptions after the members that are listed in +# the file and class documentation (similar to JavaDoc). +# Set to NO to disable this. + +BRIEF_MEMBER_DESC = YES + +# If the REPEAT_BRIEF tag is set to YES (the default) Doxygen will prepend +# the brief description of a member or function before the detailed description. +# Note: if both HIDE_UNDOC_MEMBERS and BRIEF_MEMBER_DESC are set to NO, the +# brief descriptions will be completely suppressed. + +REPEAT_BRIEF = YES + +# This tag implements a quasi-intelligent brief description abbreviator +# that is used to form the text in various listings. Each string +# in this list, if found as the leading text of the brief description, will be +# stripped from the text and the result after processing the whole list, is +# used as the annotated text. Otherwise, the brief description is used as-is. +# If left blank, the following values are used ("$name" is automatically +# replaced with the name of the entity): "The $name class" "The $name widget" +# "The $name file" "is" "provides" "specifies" "contains" +# "represents" "a" "an" "the" + +ABBREVIATE_BRIEF = + +# If the ALWAYS_DETAILED_SEC and REPEAT_BRIEF tags are both set to YES then +# Doxygen will generate a detailed section even if there is only a brief +# description. + +ALWAYS_DETAILED_SEC = NO + +# If the INLINE_INHERITED_MEMB tag is set to YES, doxygen will show all +# inherited members of a class in the documentation of that class as if those +# members were ordinary class members. Constructors, destructors and assignment +# operators of the base classes will not be shown. + +INLINE_INHERITED_MEMB = NO + +# If the FULL_PATH_NAMES tag is set to YES then Doxygen will prepend the full +# path before files name in the file list and in the header files. If set +# to NO the shortest path that makes the file name unique will be used. + +FULL_PATH_NAMES = YES + +# If the FULL_PATH_NAMES tag is set to YES then the STRIP_FROM_PATH tag +# can be used to strip a user-defined part of the path. Stripping is +# only done if one of the specified strings matches the left-hand part of +# the path. The tag can be used to show relative paths in the file list. +# If left blank the directory from which doxygen is run is used as the +# path to strip. + +STRIP_FROM_PATH = + +# The STRIP_FROM_INC_PATH tag can be used to strip a user-defined part of +# the path mentioned in the documentation of a class, which tells +# the reader which header file to include in order to use a class. +# If left blank only the name of the header file containing the class +# definition is used. Otherwise one should specify the include paths that +# are normally passed to the compiler using the -I flag. + +STRIP_FROM_INC_PATH = + +# If the SHORT_NAMES tag is set to YES, doxygen will generate much shorter +# (but less readable) file names. 
This can be useful if your file system +# doesn't support long names like on DOS, Mac, or CD-ROM. + +SHORT_NAMES = NO + +# If the JAVADOC_AUTOBRIEF tag is set to YES then Doxygen +# will interpret the first line (until the first dot) of a JavaDoc-style +# comment as the brief description. If set to NO, the JavaDoc +# comments will behave just like regular Qt-style comments +# (thus requiring an explicit @brief command for a brief description.) + +JAVADOC_AUTOBRIEF = NO + +# If the QT_AUTOBRIEF tag is set to YES then Doxygen will +# interpret the first line (until the first dot) of a Qt-style +# comment as the brief description. If set to NO, the comments +# will behave just like regular Qt-style comments (thus requiring +# an explicit \brief command for a brief description.) + +QT_AUTOBRIEF = NO + +# The MULTILINE_CPP_IS_BRIEF tag can be set to YES to make Doxygen +# treat a multi-line C++ special comment block (i.e. a block of //! or /// +# comments) as a brief description. This used to be the default behaviour. +# The new default is to treat a multi-line C++ comment block as a detailed +# description. Set this tag to YES if you prefer the old behaviour instead. + +MULTILINE_CPP_IS_BRIEF = NO + +# If the INHERIT_DOCS tag is set to YES (the default) then an undocumented +# member inherits the documentation from any documented member that it +# re-implements. + +INHERIT_DOCS = YES + +# If the SEPARATE_MEMBER_PAGES tag is set to YES, then doxygen will produce +# a new page for each member. If set to NO, the documentation of a member will +# be part of the file/class/namespace that contains it. + +SEPARATE_MEMBER_PAGES = NO + +# The TAB_SIZE tag can be used to set the number of spaces in a tab. +# Doxygen uses this value to replace tabs by spaces in code fragments. + +TAB_SIZE = 4 + +# This tag can be used to specify a number of aliases that acts +# as commands in the documentation. An alias has the form "name=value". +# For example adding "sideeffect=\par Side Effects:\n" will allow you to +# put the command \sideeffect (or @sideeffect) in the documentation, which +# will result in a user-defined paragraph with heading "Side Effects:". +# You can put \n's in the value part of an alias to insert newlines. + +ALIASES = + +# Set the OPTIMIZE_OUTPUT_FOR_C tag to YES if your project consists of C +# sources only. Doxygen will then generate output that is more tailored for C. +# For instance, some of the names that are used will be different. The list +# of all members will be omitted, etc. + +OPTIMIZE_OUTPUT_FOR_C = YES + +# Set the OPTIMIZE_OUTPUT_JAVA tag to YES if your project consists of Java +# sources only. Doxygen will then generate output that is more tailored for +# Java. For instance, namespaces will be presented as packages, qualified +# scopes will look different, etc. + +OPTIMIZE_OUTPUT_JAVA = NO + +# Set the OPTIMIZE_FOR_FORTRAN tag to YES if your project consists of Fortran +# sources only. Doxygen will then generate output that is more tailored for +# Fortran. + +OPTIMIZE_FOR_FORTRAN = NO + +# Set the OPTIMIZE_OUTPUT_VHDL tag to YES if your project consists of VHDL +# sources. Doxygen will then generate output that is tailored for +# VHDL. + +OPTIMIZE_OUTPUT_VHDL = NO + +# Doxygen selects the parser to use depending on the extension of the files it +# parses. With this tag you can assign which parser to use for a given extension. +# Doxygen has a built-in mapping, but you can override or extend it using this +# tag. 
The format is ext=language, where ext is a file extension, and language +# is one of the parsers supported by doxygen: IDL, Java, Javascript, CSharp, C, +# C++, D, PHP, Objective-C, Python, Fortran, VHDL, C, C++. For instance to make +# doxygen treat .inc files as Fortran files (default is PHP), and .f files as C +# (default is Fortran), use: inc=Fortran f=C. Note that for custom extensions +# you also need to set FILE_PATTERNS otherwise the files are not read by doxygen. + +EXTENSION_MAPPING = + +# If you use STL classes (i.e. std::string, std::vector, etc.) but do not want +# to include (a tag file for) the STL sources as input, then you should +# set this tag to YES in order to let doxygen match functions declarations and +# definitions whose arguments contain STL classes (e.g. func(std::string); v.s. +# func(std::string) {}). This also makes the inheritance and collaboration +# diagrams that involve STL classes more complete and accurate. + +BUILTIN_STL_SUPPORT = NO + +# If you use Microsoft's C++/CLI language, you should set this option to YES to +# enable parsing support. + +CPP_CLI_SUPPORT = NO + +# Set the SIP_SUPPORT tag to YES if your project consists of sip sources only. +# Doxygen will parse them like normal C++ but will assume all classes use public +# instead of private inheritance when no explicit protection keyword is present. + +SIP_SUPPORT = NO + +# For Microsoft's IDL there are propget and propput attributes to indicate getter +# and setter methods for a property. Setting this option to YES (the default) +# will make doxygen replace the get and set methods by a property in the +# documentation. This will only work if the methods are indeed getting or +# setting a simple type. If this is not the case, or you want to show the +# methods anyway, you should set this option to NO. + +IDL_PROPERTY_SUPPORT = YES + +# If member grouping is used in the documentation and the DISTRIBUTE_GROUP_DOC +# tag is set to YES, then doxygen will reuse the documentation of the first +# member in the group (if any) for the other members of the group. By default +# all members of a group must be documented explicitly. + +DISTRIBUTE_GROUP_DOC = YES + +# Set the SUBGROUPING tag to YES (the default) to allow class member groups of +# the same type (for instance a group of public functions) to be put as a +# subgroup of that type (e.g. under the Public Functions section). Set it to +# NO to prevent subgrouping. Alternatively, this can be done per class using +# the \nosubgrouping command. + +SUBGROUPING = YES + +# When the INLINE_GROUPED_CLASSES tag is set to YES, classes, structs and +# unions are shown inside the group in which they are included (e.g. using +# @ingroup) instead of on a separate page (for HTML and Man pages) or +# section (for LaTeX and RTF). + +INLINE_GROUPED_CLASSES = NO + +# When TYPEDEF_HIDES_STRUCT is enabled, a typedef of a struct, union, or enum +# is documented as struct, union, or enum with the name of the typedef. So +# typedef struct TypeS {} TypeT, will appear in the documentation as a struct +# with name TypeT. When disabled the typedef will appear as a member of a file, +# namespace, or class. And the struct will be named TypeS. This can typically +# be useful for C code in case the coding convention dictates that all compound +# types are typedef'ed and only the typedef is referenced, never the tag name. 
+ +TYPEDEF_HIDES_STRUCT = NO + +# The SYMBOL_CACHE_SIZE determines the size of the internal cache use to +# determine which symbols to keep in memory and which to flush to disk. +# When the cache is full, less often used symbols will be written to disk. +# For small to medium size projects (<1000 input files) the default value is +# probably good enough. For larger projects a too small cache size can cause +# doxygen to be busy swapping symbols to and from disk most of the time +# causing a significant performance penalty. +# If the system has enough physical memory increasing the cache will improve the +# performance by keeping more symbols in memory. Note that the value works on +# a logarithmic scale so increasing the size by one will roughly double the +# memory usage. The cache size is given by this formula: +# 2^(16+SYMBOL_CACHE_SIZE). The valid range is 0..9, the default is 0, +# corresponding to a cache size of 2^16 = 65536 symbols + +SYMBOL_CACHE_SIZE = 0 + +#--------------------------------------------------------------------------- +# Build related configuration options +#--------------------------------------------------------------------------- + +# If the EXTRACT_ALL tag is set to YES doxygen will assume all entities in +# documentation are documented, even if no documentation was available. +# Private class members and static file members will be hidden unless +# the EXTRACT_PRIVATE and EXTRACT_STATIC tags are set to YES + +EXTRACT_ALL = YES + +# If the EXTRACT_PRIVATE tag is set to YES all private members of a class +# will be included in the documentation. + +EXTRACT_PRIVATE = NO + +# If the EXTRACT_STATIC tag is set to YES all static members of a file +# will be included in the documentation. + +EXTRACT_STATIC = NO + +# If the EXTRACT_LOCAL_CLASSES tag is set to YES classes (and structs) +# defined locally in source files will be included in the documentation. +# If set to NO only classes defined in header files are included. + +EXTRACT_LOCAL_CLASSES = NO + +# This flag is only useful for Objective-C code. When set to YES local +# methods, which are defined in the implementation section but not in +# the interface are included in the documentation. +# If set to NO (the default) only methods in the interface are included. + +EXTRACT_LOCAL_METHODS = NO + +# If this flag is set to YES, the members of anonymous namespaces will be +# extracted and appear in the documentation as a namespace called +# 'anonymous_namespace{file}', where file will be replaced with the base +# name of the file that contains the anonymous namespace. By default +# anonymous namespaces are hidden. + +EXTRACT_ANON_NSPACES = NO + +# If the HIDE_UNDOC_MEMBERS tag is set to YES, Doxygen will hide all +# undocumented members of documented classes, files or namespaces. +# If set to NO (the default) these members will be included in the +# various overviews, but no documentation section is generated. +# This option has no effect if EXTRACT_ALL is enabled. + +HIDE_UNDOC_MEMBERS = NO + +# If the HIDE_UNDOC_CLASSES tag is set to YES, Doxygen will hide all +# undocumented classes that are normally visible in the class hierarchy. +# If set to NO (the default) these classes will be included in the various +# overviews. This option has no effect if EXTRACT_ALL is enabled. + +HIDE_UNDOC_CLASSES = NO + +# If the HIDE_FRIEND_COMPOUNDS tag is set to YES, Doxygen will hide all +# friend (class|struct|union) declarations. +# If set to NO (the default) these declarations will be included in the +# documentation. 
+ +HIDE_FRIEND_COMPOUNDS = NO + +# If the HIDE_IN_BODY_DOCS tag is set to YES, Doxygen will hide any +# documentation blocks found inside the body of a function. +# If set to NO (the default) these blocks will be appended to the +# function's detailed documentation block. + +HIDE_IN_BODY_DOCS = NO + +# The INTERNAL_DOCS tag determines if documentation +# that is typed after a \internal command is included. If the tag is set +# to NO (the default) then the documentation will be excluded. +# Set it to YES to include the internal documentation. + +INTERNAL_DOCS = NO + +# If the CASE_SENSE_NAMES tag is set to NO then Doxygen will only generate +# file names in lower-case letters. If set to YES upper-case letters are also +# allowed. This is useful if you have classes or files whose names only differ +# in case and if your file system supports case sensitive file names. Windows +# and Mac users are advised to set this option to NO. + +CASE_SENSE_NAMES = YES + +# If the HIDE_SCOPE_NAMES tag is set to NO (the default) then Doxygen +# will show members with their full class and namespace scopes in the +# documentation. If set to YES the scope will be hidden. + +HIDE_SCOPE_NAMES = NO + +# If the SHOW_INCLUDE_FILES tag is set to YES (the default) then Doxygen +# will put a list of the files that are included by a file in the documentation +# of that file. + +SHOW_INCLUDE_FILES = YES + +# If the FORCE_LOCAL_INCLUDES tag is set to YES then Doxygen +# will list include files with double quotes in the documentation +# rather than with sharp brackets. + +FORCE_LOCAL_INCLUDES = NO + +# If the INLINE_INFO tag is set to YES (the default) then a tag [inline] +# is inserted in the documentation for inline members. + +INLINE_INFO = YES + +# If the SORT_MEMBER_DOCS tag is set to YES (the default) then doxygen +# will sort the (detailed) documentation of file and class members +# alphabetically by member name. If set to NO the members will appear in +# declaration order. + +SORT_MEMBER_DOCS = YES + +# If the SORT_BRIEF_DOCS tag is set to YES then doxygen will sort the +# brief documentation of file, namespace and class members alphabetically +# by member name. If set to NO (the default) the members will appear in +# declaration order. + +SORT_BRIEF_DOCS = NO + +# If the SORT_MEMBERS_CTORS_1ST tag is set to YES then doxygen +# will sort the (brief and detailed) documentation of class members so that +# constructors and destructors are listed first. If set to NO (the default) +# the constructors will appear in the respective orders defined by +# SORT_MEMBER_DOCS and SORT_BRIEF_DOCS. +# This tag will be ignored for brief docs if SORT_BRIEF_DOCS is set to NO +# and ignored for detailed docs if SORT_MEMBER_DOCS is set to NO. + +SORT_MEMBERS_CTORS_1ST = YES + +# If the SORT_GROUP_NAMES tag is set to YES then doxygen will sort the +# hierarchy of group names into alphabetical order. If set to NO (the default) +# the group names will appear in their defined order. + +SORT_GROUP_NAMES = NO + +# If the SORT_BY_SCOPE_NAME tag is set to YES, the class list will be +# sorted by fully-qualified names, including namespaces. If set to +# NO (the default), the class list will be sorted only by class name, +# not including the namespace part. +# Note: This option is not very useful if HIDE_SCOPE_NAMES is set to YES. +# Note: This option applies only to the class list, not to the +# alphabetical list. 
+ +SORT_BY_SCOPE_NAME = NO + +# If the STRICT_PROTO_MATCHING option is enabled and doxygen fails to +# do proper type resolution of all parameters of a function it will reject a +# match between the prototype and the implementation of a member function even +# if there is only one candidate or it is obvious which candidate to choose +# by doing a simple string match. By disabling STRICT_PROTO_MATCHING doxygen +# will still accept a match between prototype and implementation in such cases. + +STRICT_PROTO_MATCHING = NO + +# The GENERATE_TODOLIST tag can be used to enable (YES) or +# disable (NO) the todo list. This list is created by putting \todo +# commands in the documentation. + +GENERATE_TODOLIST = YES + +# The GENERATE_TESTLIST tag can be used to enable (YES) or +# disable (NO) the test list. This list is created by putting \test +# commands in the documentation. + +GENERATE_TESTLIST = YES + +# The GENERATE_BUGLIST tag can be used to enable (YES) or +# disable (NO) the bug list. This list is created by putting \bug +# commands in the documentation. + +GENERATE_BUGLIST = YES + +# The GENERATE_DEPRECATEDLIST tag can be used to enable (YES) or +# disable (NO) the deprecated list. This list is created by putting +# \deprecated commands in the documentation. + +GENERATE_DEPRECATEDLIST= YES + +# The ENABLED_SECTIONS tag can be used to enable conditional +# documentation sections, marked by \if sectionname ... \endif. + +ENABLED_SECTIONS = + +# The MAX_INITIALIZER_LINES tag determines the maximum number of lines +# the initial value of a variable or macro consists of for it to appear in +# the documentation. If the initializer consists of more lines than specified +# here it will be hidden. Use a value of 0 to hide initializers completely. +# The appearance of the initializer of individual variables and macros in the +# documentation can be controlled using \showinitializer or \hideinitializer +# command in the documentation regardless of this setting. + +MAX_INITIALIZER_LINES = 30 + +# Set the SHOW_USED_FILES tag to NO to disable the list of files generated +# at the bottom of the documentation of classes and structs. If set to YES the +# list will mention the files that were used to generate the documentation. + +SHOW_USED_FILES = YES + +# If the sources in your project are distributed over multiple directories +# then setting the SHOW_DIRECTORIES tag to YES will show the directory hierarchy +# in the documentation. The default is NO. + +SHOW_DIRECTORIES = NO + +# Set the SHOW_FILES tag to NO to disable the generation of the Files page. +# This will remove the Files entry from the Quick Index and from the +# Folder Tree View (if specified). The default is YES. + +SHOW_FILES = YES + +# Set the SHOW_NAMESPACES tag to NO to disable the generation of the +# Namespaces page. +# This will remove the Namespaces entry from the Quick Index +# and from the Folder Tree View (if specified). The default is YES. + +SHOW_NAMESPACES = YES + +# The FILE_VERSION_FILTER tag can be used to specify a program or script that +# doxygen should invoke to get the current version for each file (typically from +# the version control system). Doxygen will invoke the program by executing (via +# popen()) the command , where is the value of +# the FILE_VERSION_FILTER tag, and is the name of an input file +# provided by doxygen. Whatever the program writes to standard output +# is used as the file version. See the manual for examples. 
+ +FILE_VERSION_FILTER = + +# The LAYOUT_FILE tag can be used to specify a layout file which will be parsed +# by doxygen. The layout file controls the global structure of the generated +# output files in an output format independent way. The create the layout file +# that represents doxygen's defaults, run doxygen with the -l option. +# You can optionally specify a file name after the option, if omitted +# DoxygenLayout.xml will be used as the name of the layout file. + +LAYOUT_FILE = + +#--------------------------------------------------------------------------- +# configuration options related to warning and progress messages +#--------------------------------------------------------------------------- + +# The QUIET tag can be used to turn on/off the messages that are generated +# by doxygen. Possible values are YES and NO. If left blank NO is used. + +QUIET = NO + +# The WARNINGS tag can be used to turn on/off the warning messages that are +# generated by doxygen. Possible values are YES and NO. If left blank +# NO is used. + +WARNINGS = YES + +# If WARN_IF_UNDOCUMENTED is set to YES, then doxygen will generate warnings +# for undocumented members. If EXTRACT_ALL is set to YES then this flag will +# automatically be disabled. + +WARN_IF_UNDOCUMENTED = YES + +# If WARN_IF_DOC_ERROR is set to YES, doxygen will generate warnings for +# potential errors in the documentation, such as not documenting some +# parameters in a documented function, or documenting parameters that +# don't exist or using markup commands wrongly. + +WARN_IF_DOC_ERROR = YES + +# The WARN_NO_PARAMDOC option can be enabled to get warnings for +# functions that are documented, but have no documentation for their parameters +# or return value. If set to NO (the default) doxygen will only warn about +# wrong or incomplete parameter documentation, but not about the absence of +# documentation. + +WARN_NO_PARAMDOC = NO + +# The WARN_FORMAT tag determines the format of the warning messages that +# doxygen can produce. The string should contain the $file, $line, and $text +# tags, which will be replaced by the file and line number from which the +# warning originated and the warning text. Optionally the format may contain +# $version, which will be replaced by the version of the file (if it could +# be obtained via FILE_VERSION_FILTER) + +WARN_FORMAT = "$file:$line: $text" + +# The WARN_LOGFILE tag can be used to specify a file to which warning +# and error messages should be written. If left blank the output is written +# to stderr. + +WARN_LOGFILE = + +#--------------------------------------------------------------------------- +# configuration options related to the input files +#--------------------------------------------------------------------------- + +# The INPUT tag can be used to specify the files and/or directories that contain +# documented source files. You may enter file names like "myfile.cpp" or +# directories like "/usr/src/myproject". Separate the files or directories +# with spaces. + +INPUT = include/crm include/crm_config.h include/doxygen.h + +# This tag can be used to specify the character encoding of the source files +# that doxygen parses. Internally doxygen uses the UTF-8 encoding, which is +# also the default input encoding. Doxygen uses libiconv (or the iconv built +# into libc) for the transcoding. See http://www.gnu.org/software/libiconv for +# the list of possible encodings. 
+ +INPUT_ENCODING = UTF-8 + +# If the value of the INPUT tag contains directories, you can use the +# FILE_PATTERNS tag to specify one or more wildcard pattern (like *.cpp +# and *.h) to filter out the source-files in the directories. If left +# blank the following patterns are tested: +# *.c *.cc *.cxx *.cpp *.c++ *.d *.java *.ii *.ixx *.ipp *.i++ *.inl *.h *.hh +# *.hxx *.hpp *.h++ *.idl *.odl *.cs *.php *.php3 *.inc *.m *.mm *.dox *.py +# *.f90 *.f *.for *.vhd *.vhdl + +FILE_PATTERNS = + +# The RECURSIVE tag can be used to turn specify whether or not subdirectories +# should be searched for input files as well. Possible values are YES and NO. +# If left blank NO is used. + +RECURSIVE = YES + +# The EXCLUDE tag can be used to specify files and/or directories that should +# excluded from the INPUT source files. This way you can easily exclude a +# subdirectory from a directory tree whose root is specified with the INPUT tag. + +EXCLUDE = + +# The EXCLUDE_SYMLINKS tag can be used select whether or not files or +# directories that are symbolic links (a Unix file system feature) are excluded +# from the input. + +EXCLUDE_SYMLINKS = NO + +# If the value of the INPUT tag contains directories, you can use the +# EXCLUDE_PATTERNS tag to specify one or more wildcard patterns to exclude +# certain files from those directories. Note that the wildcards are matched +# against the file with absolute path, so to exclude all test directories +# for example use the pattern */test/* + +EXCLUDE_PATTERNS = + +# The EXCLUDE_SYMBOLS tag can be used to specify one or more symbol names +# (namespaces, classes, functions, etc.) that should be excluded from the +# output. The symbol name can be a fully qualified name, a word, or if the +# wildcard * is used, a substring. Examples: ANamespace, AClass, +# AClass::ANamespace, ANamespace::*Test + +EXCLUDE_SYMBOLS = + +# The EXAMPLE_PATH tag can be used to specify one or more files or +# directories that contain example code fragments that are included (see +# the \include command). + +EXAMPLE_PATH = . + +# If the value of the EXAMPLE_PATH tag contains directories, you can use the +# EXAMPLE_PATTERNS tag to specify one or more wildcard pattern (like *.cpp +# and *.h) to filter out the source-files in the directories. If left +# blank all files are included. + +EXAMPLE_PATTERNS = + +# If the EXAMPLE_RECURSIVE tag is set to YES then subdirectories will be +# searched for input files to be used with the \include or \dontinclude +# commands irrespective of the value of the RECURSIVE tag. +# Possible values are YES and NO. If left blank NO is used. + +EXAMPLE_RECURSIVE = YES + +# The IMAGE_PATH tag can be used to specify one or more files or +# directories that contain image that are included in the documentation (see +# the \image command). + +IMAGE_PATH = + +# The INPUT_FILTER tag can be used to specify a program that doxygen should +# invoke to filter for each input file. Doxygen will invoke the filter program +# by executing (via popen()) the command , where +# is the value of the INPUT_FILTER tag, and is the name of an +# input file. Doxygen will then use the output that the filter program writes +# to standard output. +# If FILTER_PATTERNS is specified, this tag will be +# ignored. + +INPUT_FILTER = + +# The FILTER_PATTERNS tag can be used to specify filters on a per file pattern +# basis. +# Doxygen will compare the file name with each pattern and apply the +# filter if there is a match. 
+# The filters are a list of the form: +# pattern=filter (like *.cpp=my_cpp_filter). See INPUT_FILTER for further +# info on how filters are used. If FILTER_PATTERNS is empty or if +# non of the patterns match the file name, INPUT_FILTER is applied. + +FILTER_PATTERNS = + +# If the FILTER_SOURCE_FILES tag is set to YES, the input filter (if set using +# INPUT_FILTER) will be used to filter the input files when producing source +# files to browse (i.e. when SOURCE_BROWSER is set to YES). + +FILTER_SOURCE_FILES = NO + +# The FILTER_SOURCE_PATTERNS tag can be used to specify source filters per file +# pattern. A pattern will override the setting for FILTER_PATTERN (if any) +# and it is also possible to disable source filtering for a specific pattern +# using *.ext= (so without naming a filter). This option only has effect when +# FILTER_SOURCE_FILES is enabled. + +FILTER_SOURCE_PATTERNS = + +#--------------------------------------------------------------------------- +# configuration options related to source browsing +#--------------------------------------------------------------------------- + +# If the SOURCE_BROWSER tag is set to YES then a list of source files will +# be generated. Documented entities will be cross-referenced with these sources. +# Note: To get rid of all source code in the generated output, make sure also +# VERBATIM_HEADERS is set to NO. + +SOURCE_BROWSER = YES + +# Setting the INLINE_SOURCES tag to YES will include the body +# of functions and classes directly in the documentation. + +INLINE_SOURCES = NO + +# Setting the STRIP_CODE_COMMENTS tag to YES (the default) will instruct +# doxygen to hide any special comment blocks from generated source code +# fragments. Normal C and C++ comments will always remain visible. + +STRIP_CODE_COMMENTS = YES + +# If the REFERENCED_BY_RELATION tag is set to YES +# then for each documented function all documented +# functions referencing it will be listed. + +REFERENCED_BY_RELATION = NO + +# If the REFERENCES_RELATION tag is set to YES +# then for each documented function all documented entities +# called/used by that function will be listed. + +REFERENCES_RELATION = NO + +# If the REFERENCES_LINK_SOURCE tag is set to YES (the default) +# and SOURCE_BROWSER tag is set to YES, then the hyperlinks from +# functions in REFERENCES_RELATION and REFERENCED_BY_RELATION lists will +# link to the source code. +# Otherwise they will link to the documentation. + +REFERENCES_LINK_SOURCE = YES + +# If the USE_HTAGS tag is set to YES then the references to source code +# will point to the HTML generated by the htags(1) tool instead of doxygen +# built-in source browser. The htags tool is part of GNU's global source +# tagging system (see http://www.gnu.org/software/global/global.html). You +# will need version 4.8.6 or higher. + +USE_HTAGS = NO + +# If the VERBATIM_HEADERS tag is set to YES (the default) then Doxygen +# will generate a verbatim copy of the header file for each class for +# which an include is specified. Set to NO to disable this. + +VERBATIM_HEADERS = YES + +#--------------------------------------------------------------------------- +# configuration options related to the alphabetical class index +#--------------------------------------------------------------------------- + +# If the ALPHABETICAL_INDEX tag is set to YES, an alphabetical index +# of all compounds will be generated. Enable this if the project +# contains a lot of classes, structs, unions or interfaces. 
+ +ALPHABETICAL_INDEX = YES + +# If the alphabetical index is enabled (see ALPHABETICAL_INDEX) then +# the COLS_IN_ALPHA_INDEX tag can be used to specify the number of columns +# in which this list will be split (can be a number in the range [1..20]) + +COLS_IN_ALPHA_INDEX = 5 + +# In case all classes in a project start with a common prefix, all +# classes will be put under the same header in the alphabetical index. +# The IGNORE_PREFIX tag can be used to specify one or more prefixes that +# should be ignored while generating the index headers. + +IGNORE_PREFIX = + +#--------------------------------------------------------------------------- +# configuration options related to the HTML output +#--------------------------------------------------------------------------- + +# If the GENERATE_HTML tag is set to YES (the default) Doxygen will +# generate HTML output. + +GENERATE_HTML = YES + +# The HTML_OUTPUT tag is used to specify where the HTML docs will be put. +# If a relative path is entered the value of OUTPUT_DIRECTORY will be +# put in front of it. If left blank `html' will be used as the default path. + +HTML_OUTPUT = html + +# The HTML_FILE_EXTENSION tag can be used to specify the file extension for +# each generated HTML page (for example: .htm,.php,.asp). If it is left blank +# doxygen will generate files with .html extension. + +HTML_FILE_EXTENSION = .html + +# The HTML_HEADER tag can be used to specify a personal HTML header for +# each generated HTML page. If it is left blank doxygen will generate a +# standard header. Note that when using a custom header you are responsible +# for the proper inclusion of any scripts and style sheets that doxygen +# needs, which is dependent on the configuration options used. +# It is adviced to generate a default header using "doxygen -w html +# header.html footer.html stylesheet.css YourConfigFile" and then modify +# that header. Note that the header is subject to change so you typically +# have to redo this when upgrading to a newer version of doxygen or when changing the value of configuration settings such as GENERATE_TREEVIEW! + +HTML_HEADER = + +# The HTML_FOOTER tag can be used to specify a personal HTML footer for +# each generated HTML page. If it is left blank doxygen will generate a +# standard footer. + +HTML_FOOTER = + +# If the HTML_TIMESTAMP tag is set to YES then the generated HTML documentation will contain the timesstamp. + +HTML_TIMESTAMP = NO + +# The HTML_STYLESHEET tag can be used to specify a user-defined cascading +# style sheet that is used by each HTML page. It can be used to +# fine-tune the look of the HTML output. If the tag is left blank doxygen +# will generate a default style sheet. Note that doxygen will try to copy +# the style sheet file to the HTML output directory, so don't put your own +# stylesheet in the HTML output directory as well, or it will be erased! + +HTML_STYLESHEET = + +# The HTML_EXTRA_FILES tag can be used to specify one or more extra images or +# other source files which should be copied to the HTML output directory. Note +# that these files will be copied to the base HTML output directory. Use the +# $relpath$ marker in the HTML_HEADER and/or HTML_FOOTER files to load these +# files. In the HTML_STYLESHEET file, use the file name only. Also note that +# the files will be copied as-is; there are no commands or markers available. + +HTML_EXTRA_FILES = + +# The HTML_COLORSTYLE_HUE tag controls the color of the HTML output. 
+# Doxygen will adjust the colors in the stylesheet and background images +# according to this color. Hue is specified as an angle on a colorwheel, +# see http://en.wikipedia.org/wiki/Hue for more information. +# For instance the value 0 represents red, 60 is yellow, 120 is green, +# 180 is cyan, 240 is blue, 300 purple, and 360 is red again. +# The allowed range is 0 to 359. + +HTML_COLORSTYLE_HUE = 220 + +# The HTML_COLORSTYLE_SAT tag controls the purity (or saturation) of +# the colors in the HTML output. For a value of 0 the output will use +# grayscales only. A value of 255 will produce the most vivid colors. + +HTML_COLORSTYLE_SAT = 100 + +# The HTML_COLORSTYLE_GAMMA tag controls the gamma correction applied to +# the luminance component of the colors in the HTML output. Values below +# 100 gradually make the output lighter, whereas values above 100 make +# the output darker. The value divided by 100 is the actual gamma applied, +# so 80 represents a gamma of 0.8, The value 220 represents a gamma of 2.2, +# and 100 does not change the gamma. + +HTML_COLORSTYLE_GAMMA = 80 + +# If the HTML_TIMESTAMP tag is set to YES then the footer of each generated HTML +# page will contain the date and time when the page was generated. Setting +# this to NO can help when comparing the output of multiple runs. + +HTML_TIMESTAMP = YES + +# If the HTML_ALIGN_MEMBERS tag is set to YES, the members of classes, +# files or namespaces will be aligned in HTML using tables. If set to +# NO a bullet list will be used. + +HTML_ALIGN_MEMBERS = YES + +# If the HTML_DYNAMIC_SECTIONS tag is set to YES then the generated HTML +# documentation will contain sections that can be hidden and shown after the +# page has loaded. For this to work a browser that supports +# JavaScript and DHTML is required (for instance Mozilla 1.0+, Firefox +# Netscape 6.0+, Internet explorer 5.0+, Konqueror, or Safari). + +HTML_DYNAMIC_SECTIONS = NO + +# If the GENERATE_DOCSET tag is set to YES, additional index files +# will be generated that can be used as input for Apple's Xcode 3 +# integrated development environment, introduced with OSX 10.5 (Leopard). +# To create a documentation set, doxygen will generate a Makefile in the +# HTML output directory. Running make will produce the docset in that +# directory and running "make install" will install the docset in +# ~/Library/Developer/Shared/Documentation/DocSets so that Xcode will find +# it at startup. +# See http://developer.apple.com/tools/creatingdocsetswithdoxygen.html +# for more information. + +GENERATE_DOCSET = NO + +# When GENERATE_DOCSET tag is set to YES, this tag determines the name of the +# feed. A documentation feed provides an umbrella under which multiple +# documentation sets from a single provider (such as a company or product suite) +# can be grouped. + +DOCSET_FEEDNAME = "Doxygen generated docs" + +# When GENERATE_DOCSET tag is set to YES, this tag specifies a string that +# should uniquely identify the documentation set bundle. This should be a +# reverse domain-name style string, e.g. com.mycompany.MyDocSet. Doxygen +# will append .docset to the name. + +DOCSET_BUNDLE_ID = org.doxygen.Pacemaker + +# When GENERATE_PUBLISHER_ID tag specifies a string that should uniquely identify +# the documentation publisher. This should be a reverse domain-name style +# string, e.g. com.mycompany.MyDocSet.documentation. + +DOCSET_PUBLISHER_ID = org.doxygen.ClusterLabs + +# The GENERATE_PUBLISHER_NAME tag identifies the documentation publisher. 
+ +DOCSET_PUBLISHER_NAME = ClusterLabs + +# If the GENERATE_HTMLHELP tag is set to YES, additional index files +# will be generated that can be used as input for tools like the +# Microsoft HTML help workshop to generate a compiled HTML help file (.chm) +# of the generated HTML documentation. + +GENERATE_HTMLHELP = NO + +# If the GENERATE_HTMLHELP tag is set to YES, the CHM_FILE tag can +# be used to specify the file name of the resulting .chm file. You +# can add a path in front of the file if the result should not be +# written to the html output directory. + +CHM_FILE = + +# If the GENERATE_HTMLHELP tag is set to YES, the HHC_LOCATION tag can +# be used to specify the location (absolute path including file name) of +# the HTML help compiler (hhc.exe). If non-empty doxygen will try to run +# the HTML help compiler on the generated index.hhp. + +HHC_LOCATION = + +# If the GENERATE_HTMLHELP tag is set to YES, the GENERATE_CHI flag +# controls if a separate .chi index file is generated (YES) or that +# it should be included in the master .chm file (NO). + +GENERATE_CHI = NO + +# If the GENERATE_HTMLHELP tag is set to YES, the CHM_INDEX_ENCODING +# is used to encode HtmlHelp index (hhk), content (hhc) and project file +# content. + +CHM_INDEX_ENCODING = + +# If the GENERATE_HTMLHELP tag is set to YES, the BINARY_TOC flag +# controls whether a binary table of contents is generated (YES) or a +# normal table of contents (NO) in the .chm file. + +BINARY_TOC = NO + +# The TOC_EXPAND flag can be set to YES to add extra items for group members +# to the contents of the HTML help documentation and to the tree view. + +TOC_EXPAND = NO + +# If the GENERATE_QHP tag is set to YES and both QHP_NAMESPACE and +# QHP_VIRTUAL_FOLDER are set, an additional index file will be generated +# that can be used as input for Qt's qhelpgenerator to generate a +# Qt Compressed Help (.qch) of the generated HTML documentation. + +GENERATE_QHP = NO + +# If the QHG_LOCATION tag is specified, the QCH_FILE tag can +# be used to specify the file name of the resulting .qch file. +# The path specified is relative to the HTML output folder. + +QCH_FILE = + +# The QHP_NAMESPACE tag specifies the namespace to use when generating +# Qt Help Project output. For more information please see +# http://doc.trolltech.com/qthelpproject.html#namespace + +QHP_NAMESPACE = org.doxygen.Project + +# The QHP_VIRTUAL_FOLDER tag specifies the namespace to use when generating +# Qt Help Project output. For more information please see +# http://doc.trolltech.com/qthelpproject.html#virtual-folders + +QHP_VIRTUAL_FOLDER = doc + +# If QHP_CUST_FILTER_NAME is set, it specifies the name of a custom filter to +# add. For more information please see +# http://doc.trolltech.com/qthelpproject.html#custom-filters + +QHP_CUST_FILTER_NAME = + +# The QHP_CUST_FILT_ATTRS tag specifies the list of the attributes of the +# custom filter to add. For more information please see +# +# Qt Help Project / Custom Filters. + +QHP_CUST_FILTER_ATTRS = + +# The QHP_SECT_FILTER_ATTRS tag specifies the list of the attributes this +# project's +# filter section matches. +# +# Qt Help Project / Filter Attributes. + +QHP_SECT_FILTER_ATTRS = + +# If the GENERATE_QHP tag is set to YES, the QHG_LOCATION tag can +# be used to specify the location of Qt's qhelpgenerator. +# If non-empty doxygen will try to run qhelpgenerator on the generated +# .qhp file. 
+ +QHG_LOCATION = + +# If the GENERATE_ECLIPSEHELP tag is set to YES, additional index files +# will be generated, which together with the HTML files, form an Eclipse help +# plugin. To install this plugin and make it available under the help contents +# menu in Eclipse, the contents of the directory containing the HTML and XML +# files needs to be copied into the plugins directory of eclipse. The name of +# the directory within the plugins directory should be the same as +# the ECLIPSE_DOC_ID value. After copying Eclipse needs to be restarted before +# the help appears. + +GENERATE_ECLIPSEHELP = NO + +# A unique identifier for the eclipse help plugin. When installing the plugin +# the directory name containing the HTML and XML files should also have +# this name. + +ECLIPSE_DOC_ID = org.doxygen.Project + +# The DISABLE_INDEX tag can be used to turn on/off the condensed index at +# top of each HTML page. The value NO (the default) enables the index and +# the value YES disables it. + +DISABLE_INDEX = NO + +# The ENUM_VALUES_PER_LINE tag can be used to set the number of enum values +# (range [0,1..20]) that doxygen will group on one line in the generated HTML +# documentation. Note that a value of 0 will completely suppress the enum +# values from appearing in the overview section. + +ENUM_VALUES_PER_LINE = 4 + +# The GENERATE_TREEVIEW tag is used to specify whether a tree-like index +# structure should be generated to display hierarchical information. +# If the tag value is set to YES, a side panel will be generated +# containing a tree-like index structure (just like the one that +# is generated for HTML Help). For this to work a browser that supports +# JavaScript, DHTML, CSS and frames is required (i.e. any modern browser). +# Windows users are probably better off using the HTML help feature. + +GENERATE_TREEVIEW = NO + +# By enabling USE_INLINE_TREES, doxygen will generate the Groups, Directories, +# and Class Hierarchy pages using a tree view instead of an ordered list. + +USE_INLINE_TREES = NO + +# If the treeview is enabled (see GENERATE_TREEVIEW) then this tag can be +# used to set the initial width (in pixels) of the frame in which the tree +# is shown. + +TREEVIEW_WIDTH = 250 + +# When the EXT_LINKS_IN_WINDOW option is set to YES doxygen will open +# links to external symbols imported via tag files in a separate window. + +EXT_LINKS_IN_WINDOW = NO + +# Use this tag to change the font size of Latex formulas included +# as images in the HTML documentation. The default is 10. Note that +# when you change the font size after a successful doxygen run you need +# to manually remove any form_*.png images from the HTML output directory +# to force them to be regenerated. + +FORMULA_FONTSIZE = 10 + +# Use the FORMULA_TRANPARENT tag to determine whether or not the images +# generated for formulas are transparent PNGs. Transparent PNGs are +# not supported properly for IE 6.0, but are supported on all modern browsers. +# Note that when changing this option you need to delete any form_*.png files +# in the HTML output before the changes have effect. + +FORMULA_TRANSPARENT = YES + +# Enable the USE_MATHJAX option to render LaTeX formulas using MathJax +# (see http://www.mathjax.org) which uses client side Javascript for the +# rendering instead of using prerendered bitmaps. Use this if you do not +# have LaTeX installed or if you want to formulas look prettier in the HTML +# output. 
When enabled you also need to install MathJax separately and +# configure the path to it using the MATHJAX_RELPATH option. + +USE_MATHJAX = NO + +# When MathJax is enabled you need to specify the location relative to the +# HTML output directory using the MATHJAX_RELPATH option. The destination +# directory should contain the MathJax.js script. For instance, if the mathjax +# directory is located at the same level as the HTML output directory, then +# MATHJAX_RELPATH should be ../mathjax. The default value points to the +# mathjax.org site, so you can quickly see the result without installing +# MathJax, but it is strongly recommended to install a local copy of MathJax +# before deployment. + +MATHJAX_RELPATH = http://www.mathjax.org/mathjax + +# When the SEARCHENGINE tag is enabled doxygen will generate a search box +# for the HTML output. The underlying search engine uses javascript +# and DHTML and should work on any modern browser. Note that when using +# HTML help (GENERATE_HTMLHELP), Qt help (GENERATE_QHP), or docsets +# (GENERATE_DOCSET) there is already a search function so this one should +# typically be disabled. For large projects the javascript based search engine +# can be slow, then enabling SERVER_BASED_SEARCH may provide a better solution. + +SEARCHENGINE = YES + +# When the SERVER_BASED_SEARCH tag is enabled the search engine will be +# implemented using a PHP enabled web server instead of at the web client +# using Javascript. Doxygen will generate the search PHP script and index +# file to put on the web server. The advantage of the server +# based approach is that it scales better to large projects and allows +# full text search. The disadvantages are that it is more difficult to setup +# and does not have live searching capabilities. + +SERVER_BASED_SEARCH = NO + +#--------------------------------------------------------------------------- +# configuration options related to the LaTeX output +#--------------------------------------------------------------------------- + +# If the GENERATE_LATEX tag is set to YES (the default) Doxygen will +# generate Latex output. + +GENERATE_LATEX = NO + +# The LATEX_OUTPUT tag is used to specify where the LaTeX docs will be put. +# If a relative path is entered the value of OUTPUT_DIRECTORY will be +# put in front of it. If left blank `latex' will be used as the default path. + +LATEX_OUTPUT = latex + +# The LATEX_CMD_NAME tag can be used to specify the LaTeX command name to be +# invoked. If left blank `latex' will be used as the default command name. +# Note that when enabling USE_PDFLATEX this option is only used for +# generating bitmaps for formulas in the HTML output, but not in the +# Makefile that is written to the output directory. + +LATEX_CMD_NAME = latex + +# The MAKEINDEX_CMD_NAME tag can be used to specify the command name to +# generate index for LaTeX. If left blank `makeindex' will be used as the +# default command name. + +MAKEINDEX_CMD_NAME = makeindex + +# If the COMPACT_LATEX tag is set to YES Doxygen generates more compact +# LaTeX documents. This may be useful for small projects and may help to +# save some trees in general. + +COMPACT_LATEX = NO + +# The PAPER_TYPE tag can be used to set the paper type that is used +# by the printer. Possible values are: a4, letter, legal and +# executive. If left blank a4wide will be used. + +PAPER_TYPE = a4 + +# The EXTRA_PACKAGES tag can be to specify one or more names of LaTeX +# packages that should be included in the LaTeX output. 
+ +EXTRA_PACKAGES = + +# The LATEX_HEADER tag can be used to specify a personal LaTeX header for +# the generated latex document. The header should contain everything until +# the first chapter. If it is left blank doxygen will generate a +# standard header. Notice: only use this tag if you know what you are doing! + +LATEX_HEADER = + +# The LATEX_FOOTER tag can be used to specify a personal LaTeX footer for +# the generated latex document. The footer should contain everything after +# the last chapter. If it is left blank doxygen will generate a +# standard footer. Notice: only use this tag if you know what you are doing! + +LATEX_FOOTER = + +# If the PDF_HYPERLINKS tag is set to YES, the LaTeX that is generated +# is prepared for conversion to pdf (using ps2pdf). The pdf file will +# contain links (just like the HTML output) instead of page references +# This makes the output suitable for online browsing using a pdf viewer. + +PDF_HYPERLINKS = YES + +# If the USE_PDFLATEX tag is set to YES, pdflatex will be used instead of +# plain latex in the generated Makefile. Set this option to YES to get a +# higher quality PDF documentation. + +USE_PDFLATEX = YES + +# If the LATEX_BATCHMODE tag is set to YES, doxygen will add the \\batchmode. +# command to the generated LaTeX files. This will instruct LaTeX to keep +# running if errors occur, instead of asking the user for help. +# This option is also used when generating formulas in HTML. + +LATEX_BATCHMODE = NO + +# If LATEX_HIDE_INDICES is set to YES then doxygen will not +# include the index chapters (such as File Index, Compound Index, etc.) +# in the output. + +LATEX_HIDE_INDICES = NO + +# If LATEX_SOURCE_CODE is set to YES then doxygen will include +# source code with syntax highlighting in the LaTeX output. +# Note that which sources are shown also depends on other settings +# such as SOURCE_BROWSER. + +LATEX_SOURCE_CODE = NO + +#--------------------------------------------------------------------------- +# configuration options related to the RTF output +#--------------------------------------------------------------------------- + +# If the GENERATE_RTF tag is set to YES Doxygen will generate RTF output +# The RTF output is optimized for Word 97 and may not look very pretty with +# other RTF readers or editors. + +GENERATE_RTF = NO + +# The RTF_OUTPUT tag is used to specify where the RTF docs will be put. +# If a relative path is entered the value of OUTPUT_DIRECTORY will be +# put in front of it. If left blank `rtf' will be used as the default path. + +RTF_OUTPUT = rtf + +# If the COMPACT_RTF tag is set to YES Doxygen generates more compact +# RTF documents. This may be useful for small projects and may help to +# save some trees in general. + +COMPACT_RTF = NO + +# If the RTF_HYPERLINKS tag is set to YES, the RTF that is generated +# will contain hyperlink fields. The RTF file will +# contain links (just like the HTML output) instead of page references. +# This makes the output suitable for online browsing using WORD or other +# programs which support those fields. +# Note: wordpad (write) and others do not support links. + +RTF_HYPERLINKS = NO + +# Load stylesheet definitions from file. Syntax is similar to doxygen's +# config file, i.e. a series of assignments. You only have to provide +# replacements, missing definitions are set to their default value. + +RTF_STYLESHEET_FILE = + +# Set optional variables used in the generation of an rtf document. +# Syntax is similar to doxygen's config file. 
+ +RTF_EXTENSIONS_FILE = + +#--------------------------------------------------------------------------- +# configuration options related to the man page output +#--------------------------------------------------------------------------- + +# If the GENERATE_MAN tag is set to YES (the default) Doxygen will +# generate man pages + +GENERATE_MAN = NO + +# The MAN_OUTPUT tag is used to specify where the man pages will be put. +# If a relative path is entered the value of OUTPUT_DIRECTORY will be +# put in front of it. If left blank `man' will be used as the default path. + +MAN_OUTPUT = man + +# The MAN_EXTENSION tag determines the extension that is added to +# the generated man pages (default is the subroutine's section .3) + +MAN_EXTENSION = .3 + +# If the MAN_LINKS tag is set to YES and Doxygen generates man output, +# then it will generate one additional man file for each entity +# documented in the real man page(s). These additional files +# only source the real man page, but without them the man command +# would be unable to find the correct page. The default is NO. + +MAN_LINKS = NO + +#--------------------------------------------------------------------------- +# configuration options related to the XML output +#--------------------------------------------------------------------------- + +# If the GENERATE_XML tag is set to YES Doxygen will +# generate an XML file that captures the structure of +# the code including all documentation. + +GENERATE_XML = NO + +# The XML_OUTPUT tag is used to specify where the XML pages will be put. +# If a relative path is entered the value of OUTPUT_DIRECTORY will be +# put in front of it. If left blank `xml' will be used as the default path. + +XML_OUTPUT = xml + +# The XML_SCHEMA tag can be used to specify an XML schema, +# which can be used by a validating XML parser to check the +# syntax of the XML files. + +XML_SCHEMA = + +# The XML_DTD tag can be used to specify an XML DTD, +# which can be used by a validating XML parser to check the +# syntax of the XML files. + +XML_DTD = + +# If the XML_PROGRAMLISTING tag is set to YES Doxygen will +# dump the program listings (including syntax highlighting +# and cross-referencing information) to the XML output. Note that +# enabling this will significantly increase the size of the XML output. + +XML_PROGRAMLISTING = YES + +#--------------------------------------------------------------------------- +# configuration options for the AutoGen Definitions output +#--------------------------------------------------------------------------- + +# If the GENERATE_AUTOGEN_DEF tag is set to YES Doxygen will +# generate an AutoGen Definitions (see autogen.sf.net) file +# that captures the structure of the code including all +# documentation. Note that this feature is still experimental +# and incomplete at the moment. + +GENERATE_AUTOGEN_DEF = NO + +#--------------------------------------------------------------------------- +# configuration options related to the Perl module output +#--------------------------------------------------------------------------- + +# If the GENERATE_PERLMOD tag is set to YES Doxygen will +# generate a Perl module file that captures the structure of +# the code including all documentation. Note that this +# feature is still experimental and incomplete at the +# moment. 
+ +GENERATE_PERLMOD = NO + +# If the PERLMOD_LATEX tag is set to YES Doxygen will generate +# the necessary Makefile rules, Perl scripts and LaTeX code to be able +# to generate PDF and DVI output from the Perl module output. + +PERLMOD_LATEX = NO + +# If the PERLMOD_PRETTY tag is set to YES the Perl module output will be +# nicely formatted so it can be parsed by a human reader. +# This is useful +# if you want to understand what is going on. +# On the other hand, if this +# tag is set to NO the size of the Perl module output will be much smaller +# and Perl will parse it just the same. + +PERLMOD_PRETTY = YES + +# The names of the make variables in the generated doxyrules.make file +# are prefixed with the string contained in PERLMOD_MAKEVAR_PREFIX. +# This is useful so different doxyrules.make files included by the same +# Makefile don't overwrite each other's variables. + +PERLMOD_MAKEVAR_PREFIX = + +#--------------------------------------------------------------------------- +# Configuration options related to the preprocessor +#--------------------------------------------------------------------------- + +# If the ENABLE_PREPROCESSING tag is set to YES (the default) Doxygen will +# evaluate all C-preprocessor directives found in the sources and include +# files. + +ENABLE_PREPROCESSING = YES + +# If the MACRO_EXPANSION tag is set to YES Doxygen will expand all macro +# names in the source code. If set to NO (the default) only conditional +# compilation will be performed. Macro expansion can be done in a controlled +# way by setting EXPAND_ONLY_PREDEF to YES. + +MACRO_EXPANSION = NO + +# If the EXPAND_ONLY_PREDEF and MACRO_EXPANSION tags are both set to YES +# then the macro expansion is limited to the macros specified with the +# PREDEFINED and EXPAND_AS_DEFINED tags. + +EXPAND_ONLY_PREDEF = NO + +# If the SEARCH_INCLUDES tag is set to YES (the default) the includes files +# pointed to by INCLUDE_PATH will be searched when a #include is found. + +SEARCH_INCLUDES = YES + +# The INCLUDE_PATH tag can be used to specify one or more directories that +# contain include files that are not input files but should be processed by +# the preprocessor. + +INCLUDE_PATH = + +# You can use the INCLUDE_FILE_PATTERNS tag to specify one or more wildcard +# patterns (like *.h and *.hpp) to filter out the header-files in the +# directories. If left blank, the patterns specified with FILE_PATTERNS will +# be used. + +INCLUDE_FILE_PATTERNS = + +# The PREDEFINED tag can be used to specify one or more macro names that +# are defined before the preprocessor is started (similar to the -D option of +# gcc). The argument of the tag is a list of macros of the form: name +# or name=definition (no spaces). If the definition and the = are +# omitted =1 is assumed. To prevent a macro definition from being +# undefined via #undef or recursively expanded use the := operator +# instead of the = operator. + +PREDEFINED = + +# If the MACRO_EXPANSION and EXPAND_ONLY_PREDEF tags are set to YES then +# this tag can be used to specify a list of macro names that should be expanded. +# The macro definition that is found in the sources will be used. +# Use the PREDEFINED tag if you want to use a different macro definition that +# overrules the definition found in the source code. 
+
+EXPAND_AS_DEFINED =
+
+# If the SKIP_FUNCTION_MACROS tag is set to YES (the default) then
+# doxygen's preprocessor will remove all references to function-like macros
+# that are alone on a line, have an all uppercase name, and do not end with a
+# semicolon, because these will confuse the parser if not removed.
+
+SKIP_FUNCTION_MACROS = YES
+
+#---------------------------------------------------------------------------
+# Configuration::additions related to external references
+#---------------------------------------------------------------------------
+
+# The TAGFILES option can be used to specify one or more tagfiles.
+# Optionally an initial location of the external documentation
+# can be added for each tagfile. The format of a tag file without
+# this location is as follows:
+#
+# TAGFILES = file1 file2 ...
+# Adding location for the tag files is done as follows:
+#
+# TAGFILES = file1=loc1 "file2 = loc2" ...
+# where "loc1" and "loc2" can be relative or absolute paths or
+# URLs. If a location is present for each tag, the installdox tool
+# does not have to be run to correct the links.
+# Note that each tag file must have a unique name
+# (where the name does NOT include the path)
+# If a tag file is not located in the directory in which doxygen
+# is run, you must also specify the path to the tagfile here.
+
+TAGFILES =
+
+# When a file name is specified after GENERATE_TAGFILE, doxygen will create
+# a tag file that is based on the input files it reads.
+
+GENERATE_TAGFILE =
+
+# If the ALLEXTERNALS tag is set to YES all external classes will be listed
+# in the class index. If set to NO only the inherited external classes
+# will be listed.
+
+ALLEXTERNALS = NO
+
+# If the EXTERNAL_GROUPS tag is set to YES all external groups will be listed
+# in the modules index. If set to NO, only the current project's groups will
+# be listed.
+
+EXTERNAL_GROUPS = YES
+
+# The PERL_PATH should be the absolute path and name of the perl script
+# interpreter (i.e. the result of `which perl').
+
+PERL_PATH = /usr/bin/perl
+
+#---------------------------------------------------------------------------
+# Configuration options related to the dot tool
+#---------------------------------------------------------------------------
+
+# If the CLASS_DIAGRAMS tag is set to YES (the default) Doxygen will
+# generate an inheritance diagram (in HTML, RTF and LaTeX) for classes with base
+# or super classes. Setting the tag to NO turns the diagrams off. Note that
+# this option also works with HAVE_DOT disabled, but it is recommended to
+# install and use dot, since it yields more powerful graphs.
+
+CLASS_DIAGRAMS = YES
+
+# You can define message sequence charts within doxygen comments using the \msc
+# command. Doxygen will then run the mscgen tool (see
+# http://www.mcternan.me.uk/mscgen/) to produce the chart and insert it in the
+# documentation. The MSCGEN_PATH tag allows you to specify the directory where
+# the mscgen tool resides. If left empty the tool is assumed to be found in the
+# default search path.
+
+MSCGEN_PATH =
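+
+# For illustration only (not part of the upstream template): a hypothetical
+# source comment using the \msc command described above might look like
+#
+#   /** \msc
+#    *    Client,Server;
+#    *    Client->Server [label="request"];
+#    *    Server->Client [label="reply"];
+#    *  \endmsc
+#    */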
+
+# If set to YES, the inheritance and collaboration graphs will hide
+# inheritance and usage relations if the target is undocumented
+# or is not a class.
+
+HIDE_UNDOC_RELATIONS = YES
+
+# If you set the HAVE_DOT tag to YES then doxygen will assume the dot tool is
+# available from the path. This tool is part of Graphviz, a graph visualization
+# toolkit from AT&T and Lucent Bell Labs. The other options in this section
+# have no effect if this option is set to NO (the default)
+
+HAVE_DOT = YES
+
+# The DOT_NUM_THREADS specifies the number of dot invocations doxygen is
+# allowed to run in parallel. When set to 0 (the default) doxygen will
+# base this on the number of processors available in the system. You can set it
+# explicitly to a value larger than 0 to get control over the balance
+# between CPU load and processing speed.
+
+DOT_NUM_THREADS = 0
+
+# By default doxygen will write a font called Helvetica to the output
+# directory and reference it in all dot files that doxygen generates.
+# When you want a differently looking font you can specify the font name
+# using DOT_FONTNAME. You need to make sure dot is able to find the font,
+# which can be done by putting it in a standard location or by setting the
+# DOTFONTPATH environment variable or by setting DOT_FONTPATH to the directory
+# containing the font.
+
+DOT_FONTNAME = Helvetica
+
+# The DOT_FONTSIZE tag can be used to set the size of the font of dot graphs.
+# The default size is 10pt.
+
+DOT_FONTSIZE = 10
+
+# By default doxygen will tell dot to use the output directory to look for the
+# FreeSans.ttf font (which doxygen will put there itself). If you specify a
+# different font using DOT_FONTNAME you can set the path where dot
+# can find it using this tag.
+
+DOT_FONTPATH =
+
+# If the CLASS_GRAPH and HAVE_DOT tags are set to YES then doxygen
+# will generate a graph for each documented class showing the direct and
+# indirect inheritance relations. Setting this tag to YES will force the
+# CLASS_DIAGRAMS tag to NO.
+
+CLASS_GRAPH = YES
+
+# If the COLLABORATION_GRAPH and HAVE_DOT tags are set to YES then doxygen
+# will generate a graph for each documented class showing the direct and
+# indirect implementation dependencies (inheritance, containment, and
+# class references variables) of the class with other documented classes.
+
+COLLABORATION_GRAPH = YES
+
+# If the GROUP_GRAPHS and HAVE_DOT tags are set to YES then doxygen
+# will generate a graph for groups, showing the direct groups dependencies
+
+GROUP_GRAPHS = YES
+
+# If the UML_LOOK tag is set to YES doxygen will generate inheritance and
+# collaboration diagrams in a style similar to the OMG's Unified Modeling
+# Language.
+
+UML_LOOK = NO
+
+# If set to YES, the inheritance and collaboration graphs will show the
+# relations between templates and their instances.
+
+TEMPLATE_RELATIONS = NO
+
+# If the ENABLE_PREPROCESSING, SEARCH_INCLUDES, INCLUDE_GRAPH, and HAVE_DOT
+# tags are set to YES then doxygen will generate a graph for each documented
+# file showing the direct and indirect include dependencies of the file with
+# other documented files.
+
+INCLUDE_GRAPH = YES
+
+# If the ENABLE_PREPROCESSING, SEARCH_INCLUDES, INCLUDED_BY_GRAPH, and
+# HAVE_DOT tags are set to YES then doxygen will generate a graph for each
+# documented header file showing the documented files that directly or
+# indirectly include this file.
+
+INCLUDED_BY_GRAPH = YES
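+
+# Note, for illustration only: the CALL_GRAPH and CALLER_GRAPH options below
+# can be expensive on a code base this size. An alternative (not what this
+# file does, since it enables them globally) is to request a graph for a
+# single function from its documentation comment, e.g.:
+#
+#   /*! \callgraph */
+#   void example_function(void);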
+
+# If the CALL_GRAPH and HAVE_DOT options are set to YES then
+# doxygen will generate a call dependency graph for every global function
+# or class method. Note that enabling this option will significantly increase
+# the time of a run. So in most cases it will be better to enable call graphs
+# for selected functions only using the \callgraph command.
+
+CALL_GRAPH = YES
+
+# If the CALLER_GRAPH and HAVE_DOT tags are set to YES then
+# doxygen will generate a caller dependency graph for every global function
+# or class method. Note that enabling this option will significantly increase
+# the time of a run. So in most cases it will be better to enable caller
+# graphs for selected functions only using the \callergraph command.
+
+CALLER_GRAPH = YES
+
+# If the GRAPHICAL_HIERARCHY and HAVE_DOT tags are set to YES then doxygen
+# will generate a graphical hierarchy of all classes instead of a textual one.
+
+GRAPHICAL_HIERARCHY = YES
+
+# If the DIRECTORY_GRAPH, SHOW_DIRECTORIES and HAVE_DOT tags are set to YES
+# then doxygen will show the dependencies a directory has on other directories
+# in a graphical way. The dependency relations are determined by the #include
+# relations between the files in the directories.
+
+DIRECTORY_GRAPH = YES
+
+# The DOT_IMAGE_FORMAT tag can be used to set the image format of the images
+# generated by dot. Possible values are svg, png, jpg, or gif.
+# If left blank png will be used.
+
+DOT_IMAGE_FORMAT = png
+
+# The tag DOT_PATH can be used to specify the path where the dot tool can be
+# found. If left blank, it is assumed the dot tool can be found in the path.
+
+DOT_PATH =
+
+# The DOTFILE_DIRS tag can be used to specify one or more directories that
+# contain dot files that are included in the documentation (see the
+# \dotfile command).
+
+DOTFILE_DIRS =
+
+# The MSCFILE_DIRS tag can be used to specify one or more directories that
+# contain msc files that are included in the documentation (see the
+# \mscfile command).
+
+MSCFILE_DIRS =
+
+# The DOT_GRAPH_MAX_NODES tag can be used to set the maximum number of
+# nodes that will be shown in the graph. If the number of nodes in a graph
+# becomes larger than this value, doxygen will truncate the graph, which is
+# visualized by representing a node as a red box. Note that if the
+# number of direct children of the root node in a graph is already larger than
+# DOT_GRAPH_MAX_NODES then the graph will not be shown at all. Also note
+# that the size of a graph can be further restricted by MAX_DOT_GRAPH_DEPTH.
+
+DOT_GRAPH_MAX_NODES = 50
+
+# The MAX_DOT_GRAPH_DEPTH tag can be used to set the maximum depth of the
+# graphs generated by dot. A depth value of 3 means that only nodes reachable
+# from the root by following a path via at most 3 edges will be shown. Nodes
+# that lie further from the root node will be omitted. Note that setting this
+# option to 1 or 2 may greatly reduce the computation time needed for large
+# code bases. Also note that the size of a graph can be further restricted by
+# DOT_GRAPH_MAX_NODES. Using a depth of 0 means no depth restriction.
+
+MAX_DOT_GRAPH_DEPTH = 0
+
+# Set the DOT_TRANSPARENT tag to YES to generate images with a transparent
+# background. This is disabled by default, because dot on Windows does not
+# seem to support this out of the box. Warning: Depending on the platform used,
+# enabling this option may lead to badly anti-aliased labels on the edges of
+# a graph (i.e. they become hard to read).
+
+DOT_TRANSPARENT = NO
+
+# Set the DOT_MULTI_TARGETS tag to YES to allow dot to generate multiple output
+# files in one run (i.e. multiple -o and -T options on the command line). This
+# makes dot run faster, but since only newer versions of dot (>1.8.10)
+# support this, this feature is disabled by default.
+ +DOT_MULTI_TARGETS = NO + +# If the GENERATE_LEGEND tag is set to YES (the default) Doxygen will +# generate a legend page explaining the meaning of the various boxes and +# arrows in the dot generated graphs. + +GENERATE_LEGEND = YES + +# If the DOT_CLEANUP tag is set to YES (the default) Doxygen will +# remove the intermediate dot files that are used to generate +# the various graphs. + +DOT_CLEANUP = YES diff --git a/GNUmakefile b/GNUmakefile index 5743ca5962..4d8fe4325a 100644 --- a/GNUmakefile +++ b/GNUmakefile @@ -1,256 +1,259 @@ # # Copyright (C) 2008 Andrew Beekhof # # This program is free software; you can redistribute it and/or # modify it under the terms of the GNU General Public License # as published by the Free Software Foundation; either version 2 # of the License, or (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program; if not, write to the Free Software # Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. # default: $(shell test ! -e configure && echo initialize) $(shell test -e configure && echo core) -include Makefile PACKAGE ?= pacemaker # Force 'make dist' to be consistent with 'make export' distprefix = ClusterLabs-$(PACKAGE) distdir = $(distprefix)-$(TAG) TARFILE = $(distdir).tar.gz DIST_ARCHIVES = $(TARFILE) RPM_ROOT = $(shell pwd) RPM_OPTS = --define "_sourcedir $(RPM_ROOT)" \ --define "_specdir $(RPM_ROOT)" \ --define "_srcrpmdir $(RPM_ROOT)" \ MOCK_OPTIONS ?= --resultdir=$(RPM_ROOT)/mock --no-cleanup-after # Default to fedora compliant spec files # SLES: /etc/SuSE-release # openSUSE: /etc/SuSE-release # RHEL: /etc/redhat-release # Fedora: /etc/fedora-release, /etc/redhat-release, /etc/system-release F ?= $(shell test ! -e /etc/fedora-release && echo 0; test -e /etc/fedora-release && rpm --eval %{fedora}) ARCH ?= $(shell test -e /etc/fedora-release && rpm --eval %{_arch}) MOCK_CFG ?= $(shell test -e /etc/fedora-release && echo fedora-$(F)-$(ARCH)) DISTRO ?= $(shell test -e /etc/SuSE-release && echo suse; echo fedora) TAG ?= $(shell git log --pretty="format:%h" -n 1) WITH ?= --without=doc #WITH ?= --without=doc --with=gcov LAST_RELEASE ?= $(shell test -e /Volumes || git tag -l | grep Pacemaker | sort -Vr | head -n 1) NEXT_RELEASE ?= $(shell test -e /Volumes || git tag -l | grep Pacemaker | sort -Vr | head -n 1 | awk -F. '/[0-9]+\./{$$3+=1;OFS=".";print $$1,$$2,$$3}') BUILD_COUNTER ?= build.counter LAST_COUNT = $(shell test ! -e $(BUILD_COUNTER) && echo 0; test -e $(BUILD_COUNTER) && cat $(BUILD_COUNTER)) COUNT = $(shell expr 1 + $(LAST_COUNT)) initialize: ./autogen.sh echo "Now run configure with any arguments (eg. --prefix) specific to your system" export: rm -f $(PACKAGE)-dirty.tar.* $(PACKAGE)-tip.tar.* $(PACKAGE)-HEAD.tar.* if [ ! 
-f $(TARFILE) ]; then \ rm -f $(PACKAGE).tar.*; \ if [ $(TAG) = dirty ]; then \ git commit -m "DO-NOT-PUSH" -a; \ git archive --prefix=$(distdir)/ HEAD | gzip > $(TARFILE); \ git reset --mixed HEAD^; \ else \ git archive --prefix=$(distdir)/ $(TAG) | gzip > $(TARFILE); \ fi; \ echo `date`: Rebuilt $(TARFILE); \ else \ echo `date`: Using existing tarball: $(TARFILE); \ fi $(PACKAGE)-opensuse.spec: $(PACKAGE)-suse.spec cp $^ $@ @echo Rebuilt $@ $(PACKAGE)-suse.spec: $(PACKAGE).spec.in GNUmakefile rm -f $@ cp $(PACKAGE).spec.in $@ sed -i.sed s:%{_docdir}/%{name}:%{_docdir}/%{name}-%{version}:g $@ sed -i.sed s:corosynclib:libcorosync:g $@ sed -i.sed s:libexecdir}/lcrso:libdir}/lcrso:g $@ sed -i.sed 's:%{name}-libs:lib%{name}3:g' $@ sed -i.sed s:heartbeat-libs:heartbeat:g $@ sed -i.sed s:cluster-glue-libs:libglue:g $@ sed -i.sed s:libselinux-devel:automake:g $@ sed -i.sed s:lm_sensors-devel:automake:g $@ sed -i.sed s:bzip2-devel:libbz2-devel:g $@ sed -i.sed s:Development/Libraries:Development/Libraries/C\ and\ C++:g $@ sed -i.sed s:System\ Environment/Daemons:Productivity/Clustering/HA:g $@ sed -i.sed s:bcond_without\ publican:bcond_with\ publican:g $@ sed -i.sed s:\#global\ py_sitedir:\%global\ py_sitedir:g $@ sed -i.sed s:docbook-style-xsl:docbook-xsl-stylesheets:g $@ sed -i.sed s:libtool-ltdl-devel::g $@ sed -i.sed s:publican::g $@ sed -i.sed s:byacc::g $@ sed -i.sed s:without\ cman:with\ cman:g $@ sed -i.sed s:.*pacemaker.service.*::g $@ sed -i.sed s:global\ cs_major.*:global\ cs_major\ 1:g $@ sed -i.sed s:global\ cs_minor.*:global\ cs_minor\ 4:g $@ @echo Rebuilt $@ # Works for all fedora based distros $(PACKAGE)-%.spec: $(PACKAGE).spec.in rm -f $@ cp $(PACKAGE).spec.in $(PACKAGE)-$*.spec @echo Rebuilt $@ srpm-%: export $(PACKAGE)-%.spec rm -f *.src.rpm cp $(PACKAGE)-$*.spec $(PACKAGE).spec if [ -e $(BUILD_COUNTER) ]; then \ echo $(COUNT) > $(BUILD_COUNTER); \ fi sed -i.sed 's/Source0:.*/Source0:\ $(TARFILE)/' $(PACKAGE).spec sed -i.sed 's/global\ specversion.*/global\ specversion\ $(COUNT)/' $(PACKAGE).spec sed -i.sed 's/global\ upstream_version.*/global\ upstream_version\ $(TAG)/' $(PACKAGE).spec sed -i.sed 's/global\ upstream_prefix.*/global\ upstream_prefix\ $(distprefix)/' $(PACKAGE).spec case $(TAG) in \ Pacemaker*) sed -i.sed 's/Version:.*/Version:\ $(shell echo $(TAG) | sed s:Pacemaker-::)/' $(PACKAGE).spec;; \ *) sed -i.sed 's/Version:.*/Version:\ $(shell echo $(NEXT_RELEASE) | sed s:Pacemaker-::)/' $(PACKAGE).spec;; \ esac rpmbuild -bs --define "dist .$*" $(RPM_OPTS) $(WITH) $(PACKAGE).spec chroot: mock-$(MOCK_CFG) mock-install-$(MOCK_CFG) mock-sh-$(MOCK_CFG) echo "Done" mock-next: make F=$(shell expr 1 + $(F)) mock mock-rawhide: make F=rawhide mock mock-install-%: echo "Installing packages" mock --root=$* $(MOCK_OPTIONS) --install $(RPM_ROOT)/mock/*.rpm vi sudo valgrind lcov gdb fence-agents mock-sh-%: echo "Connecting" mock --root=$* $(MOCK_OPTIONS) --shell # eg. 
WITH="--with cman" make rpm mock-%: make srpm-$(firstword $(shell echo $(@:mock-%=%) | tr '-' ' ')) -rm -rf $(RPM_ROOT)/mock @echo "mock --root=$* --rebuild $(WITH) $(MOCK_OPTIONS) $(RPM_ROOT)/*.src.rpm" mock --root=$* --no-cleanup-after --rebuild $(WITH) $(MOCK_OPTIONS) $(RPM_ROOT)/*.src.rpm srpm: srpm-$(DISTRO) echo "Done" mock: mock-$(MOCK_CFG) echo "Done" rpm: srpm @echo To create custom builds, edit the flags and options in $(PACKAGE).spec first rpmbuild $(RPM_OPTS) $(WITH) --rebuild $(RPM_ROOT)/*.src.rpm dirty: make TAG=dirty mock COVERITY_DIR = $(shell pwd)/coverity-$(TAG) COVHOST ?= coverity.example.com COVPASS ?= password coverity: test -e configure || ./autogen.sh test -e Makefile || ./configure make core-clean rm -rf $(COVERITY_DIR) cov-build --dir $(COVERITY_DIR) make core @echo "Waiting for a Coverity license..." cov-analyze --dir $(COVERITY_DIR) --wait-for-license cov-format-errors --dir $(COVERITY_DIR) --emacs-style > $(TAG).coverity cov-format-errors --dir $(COVERITY_DIR) rsync -avzxlSD --progress $(COVERITY_DIR)/c/output/errors/ root@www.clusterlabs.org:/var/www/html/coverity/$(PACKAGE)/$(TAG) make core-clean # cov-commit-defects --host $(COVHOST) --dir $(COVERITY_DIR) --stream $(PACKAGE) --user auto --password $(COVPASS) rm -rf $(COVERITY_DIR) global: clean-generic gtags -q %.8.html: %.8 echo groff -mandoc `man -w ./$<` -T html > $@ groff -mandoc `man -w ./$<` -T html > $@ rsync -azxlSD --progress $@ root@www.clusterlabs.org:/var/www/html/man/ %.7.html: %.7 echo groff -mandoc `man -w ./$<` -T html > $@ groff -mandoc `man -w ./$<` -T html > $@ rsync -azxlSD --progress $@ root@www.clusterlabs.org:/var/www/html/man/ +doxygen: + doxygen Doxyfile + abi: abi-check pacemaker $(LAST_RELEASE) $(TAG) abi-www: abi-check -u pacemaker $(LAST_RELEASE) $(TAG) -www: global - make all - find . -name "[a-z]*.8" -exec make \{\}.html \; - find . -name "[a-z]*.7" -exec make \{\}.html \; +www: all global doxygen + find . -name "[a-z]*.8" -exec make \{\}.html \; + find . -name "[a-z]*.7" -exec make \{\}.html \; htags -sanhIT rsync -avzxlSD --progress HTML/ root@www.clusterlabs.org:/var/www/html/global/$(PACKAGE)/$(TAG) + rsync -avzxlSD --progress doc/api/html/ root@www.clusterlabs.org:/var/www/html/doxygen/$(PACKAGE)/$(TAG) make -C doc www make coverity summary: @printf "\n* `date +"%a %b %d %Y"` `hg showconfig ui.username` $(NEXT_RELEASE)-1" @printf "\n- Update source tarball to revision: `git id`" @printf "\n- Statistics:\n" @printf " Changesets: `git log --pretty=format:'%h' $(LAST_RELEASE)..HEAD | wc -l`\n" @printf " Diff: " @git diff -r $(LAST_RELEASE)..HEAD --stat | tail -n 1 changes: @printf "\n* `date +"%a %b %d %Y"` `hg showconfig ui.username` $(NEXT_RELEASE)-1" > ChangeLog @printf "\n- Update source tarball to revision: `git id`" >> ChangeLog @printf "\n- Statistics:\n">> ChangeLog @printf " Changesets: `git log --pretty=format:'%h' $(LAST_RELEASE)..HEAD | wc -l`\n" >> ChangeLog @printf " Diff: " >> ChangeLog @git diff -r $(LAST_RELEASE)..HEAD --stat | tail -n 1 >> ChangeLog @printf "\n- Changes since $(LAST_RELEASE)\n" >> ChangeLog @git log --pretty=format:' +%s' --abbrev-commit $(LAST_RELEASE)..HEAD | grep -e High: | sed -e s@High:@@ -e s@PE:@pengine:@ | sort -uf >> ChangeLog @printf "\n">> ChangeLog git show $(LAST_RELEASE):ChangeLog >> ChangeLog @echo -e "\033[1;35m -- Don't forget to run the bumplibs.sh script! --\033[0m" indent: find . 
-name "*.h" -exec ./p-indent \{\} \; find lib -name "*.c" -exec ./p-indent \{\} \; git co - lib/common/xml.c include/crm/cib_ops.h crmd/fsa_proto.h rel-tags: tags find . -name TAGS -exec sed -i.sed 's:\(.*\)/\(.*\)/TAGS:\2/TAGS:g' \{\} \; ccc_analyzer=/usr/lib64/clang-analyzer/scan-build/ccc-analyzer clang: test -e $(ccc_analyzer) || echo "Clang analyzer not available. Install the clang-analyzer package" test -e $(ccc_analyzer) || false make CC=$(ccc_analyzer) check # V3 = scandir unsetenv alphasort # V2 = setenv strerror strchrnul strndup # http://www.gnu.org/software/gnulib/manual/html_node/Initial-import.html#Initial-import GNU_MODS = crypto/md5 gnulib-update: -test ! -e gnulib && git clone git://git.savannah.gnu.org/gnulib.git cd gnulib && git pull gnulib/gnulib-tool --source-base=lib/gnu --lgpl=2 --no-vc-files --import $(GNU_MODS) diff --git a/configure.ac b/configure.ac index 9e34d78c5c..9e75f6f812 100644 --- a/configure.ac +++ b/configure.ac @@ -1,1789 +1,1810 @@ dnl dnl autoconf for Pacemaker dnl dnl License: GNU General Public License (GPL) dnl =============================================== dnl Bootstrap dnl =============================================== AC_PREREQ(2.59) dnl Suggested structure: dnl information on the package dnl checks for programs dnl checks for libraries dnl checks for header files dnl checks for types dnl checks for structures dnl checks for compiler characteristics dnl checks for library functions dnl checks for system services AC_INIT(pacemaker, 1.1.8, pacemaker@oss.clusterlabs.org) CRM_DTD_VERSION="1.2" PCMK_FEATURES="" HB_PKG=heartbeat AC_CONFIG_AUX_DIR(.) AC_CANONICAL_HOST dnl Where #defines go (e.g. `AC_CHECK_HEADERS' below) dnl dnl Internal header: include/config.h dnl - Contains ALL defines dnl - include/config.h.in is generated automatically by autoheader dnl - NOT to be included in any header files except lha_internal.h dnl (which is also not to be included in any other header files) dnl dnl External header: include/crm_config.h dnl - Contains a subset of defines checked here dnl - Manually edit include/crm_config.h.in to have configure include dnl new defines dnl - Should not include HAVE_* defines dnl - Safe to include anywhere AM_CONFIG_HEADER(include/config.h include/crm_config.h) ALL_LINGUAS="en fr" AC_ARG_WITH(version, [ --with-version=version Override package version (if you're a packager needing to pretend) ], [ PACKAGE_VERSION="$withval" ]) AC_ARG_WITH(pkg-name, [ --with-pkg-name=name Override package name (if you're a packager needing to pretend) ], [ PACKAGE_NAME="$withval" ]) AM_INIT_AUTOMAKE($PACKAGE_NAME, $PACKAGE_VERSION) AC_DEFINE_UNQUOTED(PACEMAKER_VERSION, "$PACKAGE_VERSION", Current pacemaker version) PACKAGE_SERIES=`echo $PACKAGE_VERSION | awk -F. '{ print $1"."$2 }'` AC_SUBST(PACKAGE_SERIES) AC_SUBST(PACKAGE_VERSION) dnl automake >= 1.11 offers --enable-silent-rules for suppressing the output from dnl normal compilation. When a failure occurs, it will then display the full dnl command line dnl Wrap in m4_ifdef to avoid breaking on older platforms m4_ifdef([AM_SILENT_RULES],[AM_SILENT_RULES([yes])]) dnl Example 2.4. Silent Custom Rule to Generate a File dnl %-bar.pc: %.pc dnl $(AM_V_GEN)$(LN_S) $(notdir $^) $@ CC_IN_CONFIGURE=yes export CC_IN_CONFIGURE LDD=ldd dnl ======================================================================== dnl Compiler characteristics dnl ======================================================================== AC_PROG_CC dnl Can force other with environment variable "CC".
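dnl For illustration only (not part of the original script): the compiler and
dnl flags can be overridden from the environment at configure time, e.g.
dnl   ./configure CC=clang CFLAGS="-g -O2"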
AM_PROG_CC_C_O AC_PROG_CC_STDC gl_EARLY gl_INIT AC_LIBTOOL_DLOPEN dnl Enable dlopen support... AC_LIBLTDL_CONVENIENCE dnl make libltdl a convenience lib AC_PROG_LIBTOOL AC_PROG_YACC AM_PROG_LEX AC_C_STRINGIZE AC_TYPE_SIZE_T AC_CHECK_SIZEOF(char) AC_CHECK_SIZEOF(short) AC_CHECK_SIZEOF(int) AC_CHECK_SIZEOF(long) AC_CHECK_SIZEOF(long long) AC_STRUCT_TIMEZONE dnl =============================================== dnl Helpers dnl =============================================== cc_supports_flag() { local CFLAGS="$@" AC_MSG_CHECKING(whether $CC supports "$@") AC_COMPILE_IFELSE([AC_LANG_PROGRAM([[ ]], [[ ]])], [RC=0; AC_MSG_RESULT(yes)],[RC=1; AC_MSG_RESULT(no)]) return $RC } try_extract_header_define() { AC_MSG_CHECKING(if $2 in $1 exists. If not defaulting to $3) Cfile=$srcdir/extract_define.$2.${$} printf "#include <stdio.h>\n" > ${Cfile}.c printf "#include <%s>\n" $1 >> ${Cfile}.c printf "int main(int argc, char **argv) {\n" >> ${Cfile}.c printf "#ifdef %s\n" $2 >> ${Cfile}.c printf "printf(\"%%s\", %s);\n" $2 >> ${Cfile}.c printf "#endif \n return 0; }\n" >> ${Cfile}.c $CC $CFLAGS ${Cfile}.c -o ${Cfile} value=`${Cfile}` if test x"${value}" == x""; then value=$3 fi AC_MSG_RESULT($value) printf $value rm -rf ${Cfile}.c ${Cfile} ${Cfile}.dSYM ${Cfile}.gcno } extract_header_define() { AC_MSG_CHECKING(for $2 in $1) Cfile=$srcdir/extract_define.$2.${$} printf "#include <stdio.h>\n" > ${Cfile}.c printf "#include <%s>\n" $1 >> ${Cfile}.c printf "int main(int argc, char **argv) { printf(\"%%s\", %s); return 0; }\n" $2 >> ${Cfile}.c $CC $CFLAGS ${Cfile}.c -o ${Cfile} value=`${Cfile}` AC_MSG_RESULT($value) printf $value rm -rf ${Cfile}.c ${Cfile} ${Cfile}.dSYM ${Cfile}.gcno } dnl =============================================== dnl Configure Options dnl =============================================== dnl Some systems, like Solaris require a custom package name AC_ARG_WITH(pkgname, [ --with-pkgname=name name for pkg (typically for Solaris) ], [ PKGNAME="$withval" ], [ PKGNAME="LXHAhb" ], ) AC_SUBST(PKGNAME) AC_ARG_ENABLE([ansi], [ --enable-ansi force GCC to compile to ANSI/ANSI standard for older compilers. [default=no]]) AC_ARG_ENABLE([fatal-warnings], [ --enable-fatal-warnings very pedantic and fatal warnings for gcc [default=yes]]) AC_ARG_ENABLE([quiet], [ --enable-quiet Suppress make output unless there is an error [default=no]]) AC_ARG_ENABLE([thread-safe], [ --enable-thread-safe Enable some client libraries to be thread safe. [default=no]]) AC_ARG_ENABLE([bundled-ltdl], [ --enable-bundled-ltdl Configure, build and install the standalone ltdl library bundled with ${PACKAGE} [default=no]]) LTDL_LIBS="" AC_ARG_ENABLE([no-stack], [ --enable-no-stack Only build the Policy Engine and pieces needed to support it [default=no]]) AC_ARG_ENABLE([upstart], [ --enable-upstart Do not build support for the Upstart init system [default=yes]]) AC_ARG_ENABLE([systemd], [ --enable-systemd Do not build support for the Systemd init system [default=yes]]) AC_ARG_WITH(ais, [ --with-ais Support the Corosync messaging and membership layer ], [ SUPPORT_CS=$withval ], [ SUPPORT_CS=try ], )
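dnl For illustration only: the option above surfaces on the configure command
dnl line, e.g. a hypothetical invocation disabling the Corosync/AIS layer:
dnl   ./configure --without-ais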
AC_ARG_WITH(corosync, [ --with-corosync Support the Corosync messaging and membership layer ], [ SUPPORT_CS=$withval ] dnl initialized in AC_ARG_WITH(ais...) already, dnl don't reset to try if it was given as --without-ais ) AC_ARG_WITH(heartbeat, [ --with-heartbeat Support the Heartbeat messaging and membership layer ], [ SUPPORT_HEARTBEAT=$withval ], [ SUPPORT_HEARTBEAT=try ], ) AC_ARG_WITH(cman, [ --with-cman Support the consumption of membership and quorum from cman ], [ SUPPORT_CMAN=$withval ], [ SUPPORT_CMAN=try ], ) AC_ARG_WITH(cpg, [ --with-cs-quorum Support the consumption of membership and quorum from corosync ], [ SUPPORT_CS_QUORUM=$withval ], [ SUPPORT_CS_QUORUM=try ], ) AC_ARG_WITH(snmp, [ --with-snmp Support the SNMP protocol ], [ SUPPORT_SNMP=$withval ], [ SUPPORT_SNMP=try ], ) AC_ARG_WITH(esmtp, [ --with-esmtp Support sending mail notifications with the esmtp library ], [ SUPPORT_ESMTP=$withval ], [ SUPPORT_ESMTP=try ], ) AC_ARG_WITH(acl, [ --with-acl Support CIB ACL ], [ SUPPORT_ACL=$withval ], [ SUPPORT_ACL=no ], ) CSPREFIX="" AC_ARG_WITH(ais-prefix, [ --with-ais-prefix=DIR Prefix used when Corosync was installed [$prefix]], [ CSPREFIX=$withval ], [ CSPREFIX=$prefix ]) LCRSODIR="" AC_ARG_WITH(lcrso-dir, [ --with-lcrso-dir=DIR Corosync lcrso files. ], [ LCRSODIR="$withval" ]) INITDIR="" AC_ARG_WITH(initdir, [ --with-initdir=DIR directory for init (rc) scripts [${INITDIR}]], [ INITDIR="$withval" ]) SUPPORT_PROFILING=0 AC_ARG_WITH(profiling, [ --with-profiling Support gprof profiling ], [ SUPPORT_PROFILING=$withval ]) SUPPORT_GCOV=0 AC_ARG_WITH(gcov, [ --with-gcov Support gcov coverage testing ], [ SUPPORT_GCOV=$withval ]) PUBLICAN_BRAND="common" AC_ARG_WITH(brand, [ --with-brand=brand Brand to use for generated documentation [$PUBLICAN_BRAND]], [ PUBLICAN_BRAND="$withval" ]) AC_SUBST(PUBLICAN_BRAND) ASCIIDOC_CLI_TYPE="pcs" AC_ARG_WITH(doc-cli, [ --with-doc-cli=cli_type CLI type to use for generated documentation.
[$ASCIIDOC_CLI_TYPE]], [ ASCIIDOC_CLI_TYPE="$withval" ]) AC_SUBST(ASCIIDOC_CLI_TYPE) dnl =============================================== dnl General Processing dnl =============================================== AC_SUBST(HB_PKG) INIT_EXT="" echo Our Host OS: $host_os/$host AC_MSG_NOTICE(Sanitizing prefix: ${prefix}) case $prefix in NONE) prefix=/usr dnl Fix default variables - "prefix" variable if not specified if test "$localstatedir" = "\${prefix}/var"; then localstatedir="/var" fi if test "$sysconfdir" = "\${prefix}/etc"; then sysconfdir="/etc" fi ;; esac AC_MSG_NOTICE(Sanitizing exec_prefix: ${exec_prefix}) case $exec_prefix in dnl For consistency with Heartbeat, map NONE->$prefix NONE) exec_prefix=$prefix;; prefix) exec_prefix=$prefix;; esac AC_MSG_NOTICE(Sanitizing ais_prefix: ${CSPREFIX}) case $CSPREFIX in dnl For consistency with Heartbeat, map NONE->$prefix NONE) CSPREFIX=$prefix;; prefix) CSPREFIX=$prefix;; esac AC_MSG_NOTICE(Sanitizing INITDIR: ${INITDIR}) case $INITDIR in prefix) INITDIR=$prefix;; "") AC_MSG_CHECKING(which init (rc) directory to use) for initdir in /etc/init.d /etc/rc.d/init.d /sbin/init.d \ /usr/local/etc/rc.d /etc/rc.d do if test -d $initdir then INITDIR=$initdir break fi done AC_MSG_RESULT($INITDIR);; esac AC_SUBST(INITDIR) AC_MSG_NOTICE(Sanitizing libdir: ${libdir}) case $libdir in dnl For consistency with Heartbeat, map NONE->$prefix *prefix*|NONE) AC_MSG_CHECKING(which lib directory to use) for aDir in lib64 lib do trydir="${exec_prefix}/${aDir}" if test -d ${trydir} then libdir=${trydir} break fi done AC_MSG_RESULT($libdir); ;; esac dnl Expand autoconf variables so that we dont end up with '${prefix}' dnl in #defines and python scripts dnl NOTE: Autoconf deliberately leaves them unexpanded to allow dnl make exec_prefix=/foo install dnl No longer being able to do this seems like no great loss to me... eval prefix="`eval echo ${prefix}`" eval exec_prefix="`eval echo ${exec_prefix}`" eval bindir="`eval echo ${bindir}`" eval sbindir="`eval echo ${sbindir}`" eval libexecdir="`eval echo ${libexecdir}`" eval datadir="`eval echo ${datadir}`" eval sysconfdir="`eval echo ${sysconfdir}`" eval sharedstatedir="`eval echo ${sharedstatedir}`" eval localstatedir="`eval echo ${localstatedir}`" eval libdir="`eval echo ${libdir}`" eval includedir="`eval echo ${includedir}`" eval oldincludedir="`eval echo ${oldincludedir}`" eval infodir="`eval echo ${infodir}`" eval mandir="`eval echo ${mandir}`" dnl Home-grown variables eval INITDIR="${INITDIR}" eval docdir="`eval echo ${docdir}`" if test x"${docdir}" = x""; then docdir=${datadir}/doc/${PACKAGE}-${VERSION} #docdir=${datadir}/doc/packages/${PACKAGE} fi AC_SUBST(docdir) for j in prefix exec_prefix bindir sbindir libexecdir datadir sysconfdir \ sharedstatedir localstatedir libdir includedir oldincludedir infodir \ mandir INITDIR docdir do dirname=`eval echo '${'${j}'}'` if test ! -d "$dirname" then AC_MSG_WARN([$j directory ($dirname) does not exist!]) fi done dnl This OS-based decision-making is poor autotools practice; dnl feature-based mechanisms are strongly preferred. dnl dnl So keep this section to a bare minimum; regard as a "necessary evil". 
case "$host_os" in *bsd*) LIBS="-L/usr/local/lib" CPPFLAGS="$CPPFLAGS -I/usr/local/include" INIT_EXT=".sh" ;; *solaris*) ;; *linux*) AC_DEFINE_UNQUOTED(ON_LINUX, 1, Compiling for Linux platform) CFLAGS="$CFLAGS -I${prefix}/include" ;; darwin*) AC_DEFINE_UNQUOTED(ON_DARWIN, 1, Compiling for Darwin platform) LIBS="$LIBS -L${prefix}/lib" CFLAGS="$CFLAGS -I${prefix}/include" ;; esac dnl Eventually remove this CFLAGS="$CFLAGS -I${prefix}/include/heartbeat" AC_SUBST(INIT_EXT) AC_MSG_NOTICE(Host CPU: $host_cpu) case "$host_cpu" in ppc64|powerpc64) case $CFLAGS in *powerpc64*) ;; *) if test "$GCC" = yes; then CFLAGS="$CFLAGS -m64" fi ;; esac esac AC_MSG_CHECKING(which format is needed to print uint64_t) ac_save_CFLAGS=$CFLAGS CFLAGS="-Wall -Werror" AC_COMPILE_IFELSE( [AC_LANG_PROGRAM( [ #include #include #include ], [ int max = 512; uint64_t bignum = 42; char *buffer = malloc(max); const char *random = "random"; snprintf(buffer, max-1, "", bignum, random); fprintf(stderr, "Result: %s\n", buffer); ] )], [U64T="%lu"], [U64T="%llu"] ) CFLAGS=$ac_save_CFLAGS AC_MSG_RESULT($U64T) AC_DEFINE_UNQUOTED(U64T, "$U64T", Correct printf format for logging uint64_t) dnl =============================================== dnl Program Paths dnl =============================================== PATH="$PATH:/sbin:/usr/sbin:/usr/local/sbin:/usr/local/bin" export PATH dnl Replacing AC_PROG_LIBTOOL with AC_CHECK_PROG because LIBTOOL dnl was NOT being expanded all the time thus causing things to fail. AC_CHECK_PROGS(LIBTOOL, glibtool libtool libtool15 libtool13) AM_PATH_PYTHON AC_CHECK_PROGS(MAKE, gmake make) AC_PATH_PROGS(HTML2TXT, lynx w3m) AC_PATH_PROGS(HELP2MAN, help2man) AC_PATH_PROGS(POD2MAN, pod2man, pod2man) AC_PATH_PROGS(ASCIIDOC, asciidoc) AC_PATH_PROGS(PUBLICAN, publican) AC_PATH_PROGS(INKSCAPE, inkscape) AC_PATH_PROGS(XSLTPROC, xsltproc) AC_PATH_PROGS(FOP, fop) AC_PATH_PROGS(SSH, ssh, /usr/bin/ssh) AC_PATH_PROGS(SCP, scp, /usr/bin/scp) AC_PATH_PROGS(TAR, tar) AC_PATH_PROGS(MD5, md5) AC_PATH_PROGS(TEST, test) AC_PATH_PROGS(PKGCONFIG, pkg-config) AC_PATH_PROGS(XML2CONFIG, xml2-config) AC_PATH_PROGS(VALGRIND_BIN, valgrind, /usr/bin/valgrind) AC_DEFINE_UNQUOTED(VALGRIND_BIN, "$VALGRIND_BIN", Valgrind command) dnl Disable these until we decide if the stonith config file should be supported dnl AC_PATH_PROGS(BISON, bison) dnl AC_PATH_PROGS(FLEX, flex) dnl AC_PATH_PROGS(HAVE_YACC, $YACC) if test x"${LIBTOOL}" = x""; then AC_MSG_ERROR(You need (g)libtool installed in order to build ${PACKAGE}) fi if test x"${MAKE}" = x""; then AC_MSG_ERROR(You need (g)make installed in order to build ${PACKAGE}) fi AM_CONDITIONAL(BUILD_HELP, test x"${HELP2MAN}" != x"") if test x"${HELP2MAN}" != x""; then PCMK_FEATURES="$PCMK_FEATURES generated-manpages" fi MANPAGE_XSLT="" if test x"${XSLTPROC}" != x""; then AC_MSG_CHECKING(docbook to manpage transform) XSLT=`find ${datadir} -name docbook.xsl` for xsl in $XSLT; do dname=`dirname $xsl` bname=`basename $dname` if test "$bname" = "manpages"; then MANPAGE_XSLT="$xsl" break fi done fi AC_MSG_RESULT($MANPAGE_XSLT) AC_SUBST(MANPAGE_XSLT) AM_CONDITIONAL(BUILD_XML_HELP, test x"${MANPAGE_XSLT}" != x"") if test x"${MANPAGE_XSLT}" != x""; then PCMK_FEATURES="$PCMK_FEATURES agent-manpages" fi AM_CONDITIONAL(BUILD_ASCIIDOC, test x"${ASCIIDOC}" != x"") if test x"${ASCIIDOC}" != x""; then PCMK_FEATURES="$PCMK_FEATURES ascii-docs" fi SUPPORT_STONITH_CONFIG=0 if test x"${HAVE_YACC}" != x"" -a x"${FLEX}" != x"" -a x"${BISON}" != x""; then SUPPORT_STONITH_CONFIG=1 PCMK_FEATURES="$PCMK_FEATURES st-conf" fi 
AM_CONDITIONAL(BUILD_STONITH_CONFIG, test $SUPPORT_STONITH_CONFIG = 1) AC_DEFINE_UNQUOTED(SUPPORT_STONITH_CONFIG, $SUPPORT_STONITH_CONFIG, Support a stand-alone stonith config file in addition to the CIB) AM_CONDITIONAL(BUILD_DOCBOOK, test x"${PUBLICAN}" != x"" -a x"${INKSCAPE}" != x"") if test x"${PUBLICAN}" != x"" -a x"${INKSCAPE}" != x""; then AC_MSG_NOTICE(Enabling publican) PCMK_FEATURES="$PCMK_FEATURES publican-docs" fi dnl ======================================================================== dnl checks for library functions to replace them dnl dnl NoSuchFunctionName: dnl is a dummy function which no system supplies. It is here to make dnl the system compile semi-correctly on OpenBSD which doesn't know dnl how to create an empty archive dnl dnl scandir: Only on BSD. dnl System-V systems may have it, but hidden and/or deprecated. dnl A replacement function is supplied for it. dnl dnl setenv: is some bsdish function that should also be avoided (use dnl putenv instead) dnl On the other hand, putenv doesn't provide the right API for the dnl code and has memory leaks designed in (sigh...), so dnl a replacement function is supplied for it. dnl dnl strerror: returns a string that corresponds to an errno. dnl A replacement function is supplied for it. dnl dnl strnlen: is a gnu function similar to strlen, but safer. dnl We wrote a tolerably-fast replacement function for it. dnl dnl strndup: is a gnu function similar to strdup, but safer. dnl We wrote a tolerably-fast replacement function for it. AC_REPLACE_FUNCS(alphasort NoSuchFunctionName scandir setenv strerror strchrnul unsetenv strnlen strndup) dnl =============================================== dnl Libraries dnl =============================================== AC_CHECK_LIB(socket, socket) dnl -lsocket AC_CHECK_LIB(c, dlopen) dnl if dlopen is in libc... AC_CHECK_LIB(dl, dlopen) dnl -ldl (for Linux) AC_CHECK_LIB(rt, sched_getscheduler) dnl -lrt (for Tru64) AC_CHECK_LIB(gnugetopt, getopt_long) dnl -lgnugetopt ( if available ) AC_CHECK_LIB(pam, pam_start) dnl -lpam (if available) AC_CHECK_FUNCS([sched_getparam sched_setparam sched_get_priority_min]) AC_CHECK_LIB(uuid, uuid_parse) dnl load the library if necessary AC_CHECK_FUNCS(uuid_unparse) dnl OSX ships uuid_* as standard functions AC_CHECK_HEADERS(uuid/uuid.h) if test "x$ac_cv_func_uuid_unparse" != xyes; then AC_MSG_ERROR(You do not have the libuuid development package installed) fi if test x"${PKGCONFIG}" = x""; then AC_MSG_ERROR(You need pkgconfig installed in order to build ${PACKAGE}) fi
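dnl For illustration only: the glib detection below is roughly what one would
dnl run by hand with, e.g.,
dnl   pkg-config --exists glib-2.0 && pkg-config --cflags --libs glib-2.0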
if test "x${enable_thread_safe}" = "xyes"; then GPKGNAME="gthread-2.0" else GPKGNAME="glib-2.0" fi if $PKGCONFIG --exists $GPKGNAME then GLIBCONFIG="$PKGCONFIG $GPKGNAME" else set -x echo PKG_CONFIG_PATH=$PKG_CONFIG_PATH $PKGCONFIG --exists $GPKGNAME; echo $? $PKGCONFIG --cflags $GPKGNAME; echo $? $PKGCONFIG $GPKGNAME; echo $? set +x AC_MSG_ERROR(You need glib2-devel installed in order to build ${PACKAGE}) fi AC_MSG_RESULT(using $GLIBCONFIG) AC_CHECK_LIB(glib-2.0, g_hash_table_get_values) if test "x$ac_cv_lib_glib_2_0_g_hash_table_get_values" != x""yes; then AC_DEFINE_UNQUOTED(NEED_G_HASH_ITER, 1, glib-2.0 has no hashtable iterators) fi AC_CHECK_LIB(glib-2.0, g_list_free_full) if test "x$ac_cv_lib_glib_2_0_g_list_free_full" != x""yes; then AC_DEFINE_UNQUOTED(NEED_G_LIST_FREE_FULL, 1, glib-2.0 has no g_list_free_full) fi if $PKGCONFIG --exists systemd then systemdunitdir=`$PKGCONFIG --variable=systemdsystemunitdir systemd` AC_SUBST(systemdunitdir) fi AM_CONDITIONAL(HAVE_SYSTEMD, test -n "$systemdunitdir" -a "x$systemdunitdir" != xno) # # Where is dlopen? # if test "$ac_cv_lib_c_dlopen" = yes; then LIBADD_DL="" elif test "$ac_cv_lib_dl_dlopen" = yes; then LIBADD_DL=-ldl else LIBADD_DL=${lt_cv_dlopen_libs} fi dnl dnl Check for location of gettext dnl dnl On at least Solaris 2.x, where it is in libc, specifying lintl causes dnl grief. Ensure minimal result, not the sum of all possibilities. dnl And do libc first. dnl Known examples: dnl c: Linux, Solaris 2.6+ dnl intl: BSD, AIX AC_CHECK_LIB(c, gettext) if test x$ac_cv_lib_c_gettext != xyes; then AC_CHECK_LIB(intl, gettext) fi if test x$ac_cv_lib_c_gettext != xyes -a x$ac_cv_lib_intl_gettext != xyes; then AC_MSG_ERROR(You need gettext installed in order to build ${PACKAGE}) fi if test "X$GLIBCONFIG" != X; then AC_MSG_CHECKING(for special glib includes: ) GLIBHEAD=`$GLIBCONFIG --cflags` AC_MSG_RESULT($GLIBHEAD) CPPFLAGS="$CPPFLAGS $GLIBHEAD" AC_MSG_CHECKING(for glib library flags) GLIBLIB=`$GLIBCONFIG --libs` AC_MSG_RESULT($GLIBLIB) LIBS="$LIBS $GLIBLIB" fi dnl ======================================================================== dnl Headers dnl ======================================================================== AC_HEADER_STDC AC_CHECK_HEADERS(arpa/inet.h) AC_CHECK_HEADERS(asm/types.h) AC_CHECK_HEADERS(assert.h) AC_CHECK_HEADERS(auth-client.h) AC_CHECK_HEADERS(ctype.h) AC_CHECK_HEADERS(dirent.h) AC_CHECK_HEADERS(errno.h) AC_CHECK_HEADERS(fcntl.h) AC_CHECK_HEADERS(getopt.h) AC_CHECK_HEADERS(glib.h) AC_CHECK_HEADERS(grp.h) AC_CHECK_HEADERS(limits.h) AC_CHECK_HEADERS(linux/errqueue.h) AC_CHECK_HEADERS(malloc.h) AC_CHECK_HEADERS(netdb.h) AC_CHECK_HEADERS(netinet/in.h) AC_CHECK_HEADERS(netinet/ip.h) AC_CHECK_HEADERS(pam/pam_appl.h) AC_CHECK_HEADERS(pthread.h) AC_CHECK_HEADERS(pwd.h) AC_CHECK_HEADERS(security/pam_appl.h) AC_CHECK_HEADERS(sgtty.h) AC_CHECK_HEADERS(signal.h) AC_CHECK_HEADERS(stdarg.h) AC_CHECK_HEADERS(stddef.h) AC_CHECK_HEADERS(stdio.h) AC_CHECK_HEADERS(stdlib.h) AC_CHECK_HEADERS(string.h) AC_CHECK_HEADERS(strings.h) AC_CHECK_HEADERS(sys/dir.h) AC_CHECK_HEADERS(sys/ioctl.h) AC_CHECK_HEADERS(sys/param.h) AC_CHECK_HEADERS(sys/poll.h) AC_CHECK_HEADERS(sys/resource.h) AC_CHECK_HEADERS(sys/select.h) AC_CHECK_HEADERS(sys/socket.h) AC_CHECK_HEADERS(sys/sockio.h) AC_CHECK_HEADERS(sys/stat.h) AC_CHECK_HEADERS(sys/time.h) AC_CHECK_HEADERS(sys/timeb.h) AC_CHECK_HEADERS(sys/types.h) AC_CHECK_HEADERS(sys/uio.h) AC_CHECK_HEADERS(sys/un.h) AC_CHECK_HEADERS(sys/utsname.h) AC_CHECK_HEADERS(sys/wait.h) AC_CHECK_HEADERS(time.h) AC_CHECK_HEADERS(unistd.h) AC_CHECK_HEADERS(winsock.h) dnl These headers need prerequisites before the tests will pass dnl AC_CHECK_HEADERS(net/if.h) dnl AC_CHECK_HEADERS(netinet/icmp6.h) dnl AC_CHECK_HEADERS(netinet/ip6.h) dnl AC_CHECK_HEADERS(netinet/ip_icmp.h) AC_MSG_CHECKING(for special libxml2 includes) if test "x$XML2CONFIG" = "x"; then
AC_MSG_ERROR(libxml2 config not found) else XML2HEAD="`$XML2CONFIG --cflags`" AC_MSG_RESULT($XML2HEAD) AC_CHECK_LIB(xml2, xmlReadMemory) AC_CHECK_LIB(xslt, xsltApplyStylesheet) fi CPPFLAGS="$CPPFLAGS $XML2HEAD" AC_CHECK_HEADERS(libxml/xpath.h) AC_CHECK_HEADERS(libxslt/xslt.h) if test "$ac_cv_header_libxml_xpath_h" != "yes"; then AC_MSG_ERROR(The libxml development headers were not found) fi if test "$ac_cv_header_libxslt_xslt_h" != "yes"; then AC_MSG_ERROR(The libxslt development headers were not found) fi dnl ======================================================================== dnl Structures dnl ======================================================================== AC_CHECK_MEMBERS([struct tm.tm_gmtoff],,,[[#include <time.h>]]) AC_CHECK_MEMBERS([lrm_op_t.rsc_deleted],,,[[#include <lrm/lrm_api.h>]]) dnl ======================================================================== dnl Functions dnl ======================================================================== AC_CHECK_FUNCS(g_log_set_default_handler) AC_CHECK_FUNCS(getopt, AC_DEFINE(HAVE_DECL_GETOPT, 1, [Have getopt function])) AC_CHECK_FUNCS(nanosleep, AC_DEFINE(HAVE_DECL_NANOSLEEP, 1, [Have nanosleep function])) dnl ======================================================================== dnl ltdl dnl ======================================================================== AC_CHECK_LIB(ltdl, lt_dlopen, [LTDL_foo=1]) if test "x${enable_bundled_ltdl}" = "xyes"; then if test $ac_cv_lib_ltdl_lt_dlopen = yes; then AC_MSG_NOTICE([Disabling usage of installed ltdl]) fi ac_cv_lib_ltdl_lt_dlopen=no fi LIBLTDL_DIR="" if test $ac_cv_lib_ltdl_lt_dlopen != yes ; then AC_MSG_NOTICE([Installing local ltdl]) LIBLTDL_DIR=libltdl ( cd $srcdir ; $TAR -xvf libltdl.tar ) if test "$?" -ne 0; then AC_MSG_ERROR([$TAR of libltdl.tar in $srcdir failed]) fi AC_CONFIG_SUBDIRS(libltdl) else LIBS="$LIBS -lltdl" AC_MSG_NOTICE([Using installed ltdl]) INCLTDL="" LIBLTDL="" fi AC_SUBST(INCLTDL) AC_SUBST(LIBLTDL) AC_SUBST(LIBLTDL_DIR) dnl ======================================================================== dnl bzip2 dnl ======================================================================== AC_CHECK_HEADERS(bzlib.h) AC_CHECK_LIB(bz2, BZ2_bzBuffToBuffCompress) if test x$ac_cv_lib_bz2_BZ2_bzBuffToBuffCompress != xyes ; then AC_MSG_ERROR(BZ2 libraries not found) fi if test x$ac_cv_header_bzlib_h != xyes; then AC_MSG_ERROR(BZ2 Development headers not found) fi +dnl ======================================================================== +dnl sighandler_t is missing from Illumos, Solaris11 systems +dnl ======================================================================== + +AC_MSG_CHECKING([for sighandler_t]) +AC_TRY_COMPILE([#include <signal.h>],[sighandler_t *f;], +has_sighandler_t=yes,has_sighandler_t=no) +AC_MSG_RESULT($has_sighandler_t) +if test "$has_sighandler_t" = "yes" ; then + AC_DEFINE( HAVE_SIGHANDLER_T, 1, [Define if sighandler_t available] ) +fi dnl ======================================================================== dnl ncurses dnl ======================================================================== dnl dnl A few OSes (e.g. Linux) deliver a default "ncurses" alongside "curses". dnl Many non-Linux deliver "curses"; sites may add "ncurses". dnl dnl However, the source-code recommendation for both is to #include "curses.h" dnl (i.e. "ncurses" still wants the include to be simple, no-'n', "curses.h"). dnl dnl ncurses takes precedence.
dnl AC_CHECK_HEADERS(curses.h) AC_CHECK_HEADERS(curses/curses.h) AC_CHECK_HEADERS(ncurses.h) AC_CHECK_HEADERS(ncurses/ncurses.h) dnl Although n-library is preferred, only look for it if the n-header was found. CURSESLIBS='' if test "$ac_cv_header_ncurses_h" = "yes"; then AC_CHECK_LIB(ncurses, printw, [CURSESLIBS='-lncurses'; AC_DEFINE(HAVE_LIBNCURSES,1, have ncurses library)] ) fi if test "$ac_cv_header_ncurses_ncurses_h" = "yes"; then AC_CHECK_LIB(ncurses, printw, [CURSESLIBS='-lncurses'; AC_DEFINE(HAVE_LIBNCURSES,1, have ncurses library)] ) fi dnl Only look for non-n-library if there was no n-library. if test X"$CURSESLIBS" = X"" -a "$ac_cv_header_curses_h" = "yes"; then AC_CHECK_LIB(curses, printw, [CURSESLIBS='-lcurses'; AC_DEFINE(HAVE_LIBCURSES,1, have curses library)] ) fi dnl Only look for non-n-library if there was no n-library. if test X"$CURSESLIBS" = X"" -a "$ac_cv_header_curses_curses_h" = "yes"; then AC_CHECK_LIB(curses, printw, [CURSESLIBS='-lcurses'; AC_DEFINE(HAVE_LIBCURSES,1, have curses library)] ) fi if test "x$CURSESLIBS" != "x"; then PCMK_FEATURES="$PCMK_FEATURES ncurses" fi dnl Check for printw() prototype compatibility if test X"$CURSESLIBS" != X"" && cc_supports_flag -Wcast-qual && cc_supports_flag -Werror; then AC_MSG_CHECKING(whether printw() requires argument of "const char *") ac_save_LIBS=$LIBS LIBS="$CURSESLIBS $LIBS" ac_save_CFLAGS=$CFLAGS CFLAGS="-Wcast-qual -Werror" AC_LINK_IFELSE( [AC_LANG_PROGRAM( [ -#if defined(HAVE_CURSES_H) -# include <curses.h> -#elif defined(HAVE_NCURSES_H) +#if defined(HAVE_NCURSES_H) # include <ncurses.h> +#elif defined(HAVE_NCURSES_NCURSES_H) +# include <ncurses/ncurses.h> +#elif defined(HAVE_CURSES_H) +# include <curses.h> #endif ], [printw((const char *)"Test");] )], [ac_cv_compatible_printw=yes], [ac_cv_compatible_printw=no] ) LIBS=$ac_save_LIBS CFLAGS=$ac_save_CFLAGS AC_MSG_RESULT([$ac_cv_compatible_printw]) if test "$ac_cv_compatible_printw" = no; then AC_MSG_WARN([The printw() function of your ncurses or curses library is old, we will disable usage of the library. If you want to use this library anyway, please update to newer version of the library, ncurses 5.4 or later is recommended.
You can get the library from http://www.gnu.org/software/ncurses/.]) AC_MSG_NOTICE([Disabling curses]) AC_DEFINE(HAVE_INCOMPATIBLE_PRINTW, 1, [Do we have incompatible printw() in curses library?]) fi fi AC_SUBST(CURSESLIBS) dnl ======================================================================== dnl Profiling and GProf dnl ======================================================================== case $SUPPORT_PROFILING in 1|yes|true) SUPPORT_PROFILING=1 dnl Enable gprof #LIBS="$LIBS -pg" #CFLAGS="$CFLAGS -pg" dnl Disable various compiler optimizations CFLAGS="$CFLAGS -fno-omit-frame-pointer" #CFLAGS="$CFLAGS -fno-inline-functions -fno-inline-functions-called-once -fno-optimize-sibling-calls" dnl CFLAGS="$CFLAGS -fno-default-inline -fno-inline" dnl Update features PCMK_FEATURES="$PCMK_FEATURES gprof" ;; *) SUPPORT_PROFILING=0;; esac AC_DEFINE_UNQUOTED(SUPPORT_PROFILING, $SUPPORT_PROFILING, Support for gprof profiling) case $SUPPORT_GCOV in 1|yes|true) SUPPORT_GCOV=1 dnl Enable gprof #LIBS="$LIBS -pg" #CFLAGS="$CFLAGS -pg" dnl Disable various compiler optimizations CFLAGS="$CFLAGS -fprofile-arcs -ftest-coverage -fno-inline" dnl Turn off optimization so code coverage tool dnl can get accurate line numbers AC_MSG_NOTICE(Old CFLAGS: $CFLAGS) CFLAGS=`echo $CFLAGS | sed -e 's/-O.\ //g' -e 's/-Wp,-D_FORTIFY_SOURCE=.\ //g'` CFLAGS="$CFLAGS -O0" AC_MSG_NOTICE(New CFLAGS: $CFLAGS) dnl Update features PCMK_FEATURES="$PCMK_FEATURES gcov" ;; *) SUPPORT_GCOV=0;; esac AC_DEFINE_UNQUOTED(SUPPORT_GCOV, $SUPPORT_GCOV, Support for gcov coverage testing) dnl ======================================================================== dnl Cluster infrastructure - Heartbeat / LibQB dnl ======================================================================== dnl Compatibility checks AC_CHECK_MEMBERS([struct lrm_ops.fail_rsc],,,[[#include <lrm/lrm_api.h>]]) if test x${enable_no_stack} = xyes; then SUPPORT_HEARTBEAT=no SUPPORT_CS=no fi PKG_CHECK_MODULES(libqb, libqb, HAVE_libqb=1, HAVE_libqb=0) AC_CHECK_HEADERS(qb/qbipc_common.h) AC_CHECK_LIB(qb, qb_ipcc_is_connected) AC_CHECK_FUNCS(qb_ipcc_is_connected) LIBQB_LOG=1 PCMK_FEATURES="$PCMK_FEATURES libqb-logging libqb-ipc" if test $ac_cv_lib_qb_qb_ipcc_is_connected != yes; then AC_MSG_FAILURE(Version of IPC in libqb is not new enough) fi AC_DEFINE_UNQUOTED(LIBQB_LOGGING, $LIBQB_LOG, Use libqb for logging) AC_DEFINE_UNQUOTED(LIBQB_IPC, 0, Use libqb for IPC) LIBS="$LIBS $libqb_LIBS" AC_CHECK_HEADERS(heartbeat/hb_config.h) AC_CHECK_HEADERS(heartbeat/glue_config.h) AC_CHECK_HEADERS(stonith/stonith.h) AC_CHECK_HEADERS(agent_config.h) GLUE_HEADER=none HAVE_GLUE=0 if test "$ac_cv_header_heartbeat_glue_config_h" = "yes"; then GLUE_HEADER=glue_config.h HAVE_GLUE=1 elif test "$ac_cv_header_heartbeat_hb_config_h" = "yes"; then GLUE_HEADER=hb_config.h HAVE_GLUE=1 else AC_MSG_WARN(cluster-glue development headers were not found) fi if test "$ac_cv_header_stonith_stonith_h" = "yes"; then PCMK_FEATURES="$PCMK_FEATURES lha-fencing" fi if test $HAVE_GLUE = 1; then dnl On Debian, AC_CHECK_LIB fails if a library has any unresolved symbols dnl So check for all the dependencies (so they're added to LIBS) before checking for -lplumb AC_CHECK_LIB(pils, PILLoadPlugin) AC_CHECK_LIB(plumb, G_main_add_IPC_Channel) fi dnl =============================================== dnl Variables needed for substitution dnl =============================================== CRM_DTD_DIRECTORY="${datadir}/pacemaker" AC_DEFINE_UNQUOTED(CRM_DTD_DIRECTORY,"$CRM_DTD_DIRECTORY", Location for the Pacemaker Relax-NG Schema)
AC_SUBST(CRM_DTD_DIRECTORY) AC_DEFINE_UNQUOTED(CRM_DTD_VERSION,"$CRM_DTD_VERSION", Current version of the Pacemaker Relax-NG Schema) AC_SUBST(CRM_DTD_VERSION) CRM_CORE_DIR=`try_extract_header_define $GLUE_HEADER HA_COREDIR ${localstatedir}/lib/pacemaker/cores` AC_DEFINE_UNQUOTED(CRM_CORE_DIR,"$CRM_CORE_DIR", Location to store core files produced by Pacemaker daemons) AC_SUBST(CRM_CORE_DIR) CRM_DAEMON_USER=`try_extract_header_define $GLUE_HEADER HA_CCMUSER hacluster` AC_DEFINE_UNQUOTED(CRM_DAEMON_USER,"$CRM_DAEMON_USER", User to run Pacemaker daemons as) AC_SUBST(CRM_DAEMON_USER) CRM_DAEMON_GROUP=`try_extract_header_define $GLUE_HEADER HA_APIGROUP haclient` AC_DEFINE_UNQUOTED(CRM_DAEMON_GROUP,"$CRM_DAEMON_GROUP", Group to run Pacemaker daemons as) AC_SUBST(CRM_DAEMON_GROUP) CRM_STATE_DIR=${localstatedir}/run/crm AC_DEFINE_UNQUOTED(CRM_STATE_DIR,"$CRM_STATE_DIR", Where to keep state files and sockets) AC_SUBST(CRM_STATE_DIR) CRM_BLACKBOX_DIR=${localstatedir}/lib/pacemaker/blackbox AC_DEFINE_UNQUOTED(CRM_BLACKBOX_DIR,"$CRM_BLACKBOX_DIR", Where to keep blackbox dumps) AC_SUBST(CRM_BLACKBOX_DIR) PE_STATE_DIR="${localstatedir}/lib/pacemaker/pengine" AC_DEFINE_UNQUOTED(PE_STATE_DIR,"$PE_STATE_DIR", Where to keep PEngine outputs) AC_SUBST(PE_STATE_DIR) CRM_CONFIG_DIR="${localstatedir}/lib/pacemaker/cib" AC_DEFINE_UNQUOTED(CRM_CONFIG_DIR,"$CRM_CONFIG_DIR", Where to keep configuration files) AC_SUBST(CRM_CONFIG_DIR) CRM_LEGACY_CONFIG_DIR="${localstatedir}/lib/heartbeat/crm" AC_DEFINE_UNQUOTED(CRM_LEGACY_CONFIG_DIR,"$CRM_LEGACY_CONFIG_DIR", Where Pacemaker used to keep configuration files) AC_SUBST(CRM_LEGACY_CONFIG_DIR) CRM_DAEMON_DIR="${libexecdir}/pacemaker" AC_DEFINE_UNQUOTED(CRM_DAEMON_DIR,"$CRM_DAEMON_DIR", Location for Pacemaker daemons) AC_SUBST(CRM_DAEMON_DIR) HB_DAEMON_DIR=`try_extract_header_define $GLUE_HEADER HA_LIBHBDIR $libdir/heartbeat` AC_DEFINE_UNQUOTED(HB_DAEMON_DIR,"$HB_DAEMON_DIR", Location where Heartbeat expects Pacemaker daemons to be) AC_SUBST(HB_DAEMON_DIR) dnl Needed so that the Corosync plugin can clear out the directory as Heartbeat does HA_STATE_DIR=`try_extract_header_define $GLUE_HEADER HA_VARRUNDIR ${localstatedir}/run` AC_DEFINE_UNQUOTED(HA_STATE_DIR,"$HA_STATE_DIR", Where Heartbeat keeps state files and sockets) AC_SUBST(HA_STATE_DIR) CRM_RSCTMP_DIR=`try_extract_header_define agent_config.h HA_RSCTMPDIR $HA_STATE_DIR/heartbeat/rsctmp` AC_MSG_CHECKING(Scratch dir for resource agents) AC_MSG_RESULT($CRM_RSCTMP_DIR) AC_DEFINE_UNQUOTED(CRM_RSCTMP_DIR,"$CRM_RSCTMP_DIR", Where resource agents should keep state files) AC_SUBST(CRM_RSCTMP_DIR) dnl Needed for the location of hostcache in CTS.py HA_VARLIBHBDIR=`try_extract_header_define $GLUE_HEADER HA_VARLIBHBDIR ${localstatedir}/lib/heartbeat` AC_SUBST(HA_VARLIBHBDIR) AC_DEFINE_UNQUOTED(UUID_FILE,"$localstatedir/lib/heartbeat/hb_uuid", Location of Heartbeat's UUID file) OCF_ROOT_DIR=`try_extract_header_define $GLUE_HEADER OCF_ROOT_DIR /usr/lib/ocf` if test "X$OCF_ROOT_DIR" = X; then AC_MSG_ERROR(Could not locate OCF directory) fi AC_SUBST(OCF_ROOT_DIR) OCF_RA_DIR=`try_extract_header_define $GLUE_HEADER OCF_RA_DIR $OCF_ROOT_DIR/resource.d` AC_DEFINE_UNQUOTED(OCF_RA_DIR,"$OCF_RA_DIR", Location for OCF RAs) AC_SUBST(OCF_RA_DIR) RH_STONITH_DIR="$sbindir" AC_DEFINE_UNQUOTED(RH_STONITH_DIR,"$RH_STONITH_DIR", Location for Red Hat Stonith agents) RH_STONITH_PREFIX="fence_" AC_DEFINE_UNQUOTED(RH_STONITH_PREFIX,"$RH_STONITH_PREFIX", Prefix for Red Hat Stonith agents) AC_PATH_PROGS(GIT, git false) AC_MSG_CHECKING(build version)
BUILD_VERSION=$Format:%h$
if test $BUILD_VERSION != ":%h$"; then
    AC_MSG_RESULT(archive hash: $BUILD_VERSION)

elif test -x $GIT -a -d .git; then
    BUILD_VERSION=`$GIT log --pretty="format:%h" -n 1`
    AC_MSG_RESULT(git hash: $BUILD_VERSION)

else
    # The current directory name makes a reasonable default
    # Most generated archives will include the hash or tag
    BASE=`basename $PWD`
    BUILD_VERSION=`echo $BASE | sed s:.*[[Pp]]acemaker-::`
    AC_MSG_RESULT(directory based hash: $BUILD_VERSION)
fi

AC_DEFINE_UNQUOTED(BUILD_VERSION, "$BUILD_VERSION", Build version)
AC_SUBST(BUILD_VERSION)

HAVE_gio=1
HAVE_upstart=0
HAVE_systemd=0
PKG_CHECK_MODULES(GIO, gio-2.0, ,HAVE_gio=0)

AC_CHECK_TYPE([GDBusProxy],,,[[#include ]])
if test x$ac_cv_type_GDBusProxy != xyes; then
    HAVE_gio=0
    AC_MSG_WARN(Unable to support systemd/upstart. You need to use glib >= 2.26)
fi

if test $HAVE_gio = 1 -a "x${enable_upstart}" != xno; then
    HAVE_upstart=1
    PCMK_FEATURES="$PCMK_FEATURES upstart"
fi
AC_DEFINE_UNQUOTED(SUPPORT_UPSTART, $HAVE_upstart, Support upstart based system services)
AM_CONDITIONAL(BUILD_UPSTART, test $HAVE_upstart = 1)

if test $HAVE_gio = 1 -a "x${enable_systemd}" != xno; then
    HAVE_systemd=1
    PCMK_FEATURES="$PCMK_FEATURES systemd"
fi
AC_DEFINE_UNQUOTED(SUPPORT_SYSTEMD, $HAVE_systemd, Support systemd based system services)
AM_CONDITIONAL(BUILD_SYSTEMD, test $HAVE_systemd = 1)

STACKS=""
CLUSTERLIBS=""

dnl ========================================================================
dnl Cluster stack - Heartbeat
dnl ========================================================================

case $SUPPORT_HEARTBEAT in
  1|yes|true|try)
    AC_MSG_CHECKING(for heartbeat support)
    AC_CHECK_LIB(hbclient, ll_cluster_new,
                 [SUPPORT_HEARTBEAT=1],
                 [if test $SUPPORT_HEARTBEAT != try; then
                      AC_MSG_FAILURE(Unable to support Heartbeat: client libraries not found)
                  fi])

    if test $SUPPORT_HEARTBEAT = 1 ; then
        STACKS="$STACKS heartbeat"
        dnl objdump -x ${libdir}/libccmclient.so | grep SONAME | awk '{print $2}'
        AC_DEFINE_UNQUOTED(CCM_LIBRARY, "libccmclient.so.1", Library to load for ccm support)
        AC_DEFINE_UNQUOTED(HEARTBEAT_LIBRARY, "libhbclient.so.1", Library to load for heartbeat support)
    else
        SUPPORT_HEARTBEAT=0
    fi
    ;;
  *) SUPPORT_HEARTBEAT=0;;
esac

AM_CONDITIONAL(BUILD_HEARTBEAT_SUPPORT, test $SUPPORT_HEARTBEAT = 1)
AC_DEFINE_UNQUOTED(SUPPORT_HEARTBEAT, $SUPPORT_HEARTBEAT, Support the Heartbeat messaging and membership layer)
AC_SUBST(SUPPORT_HEARTBEAT)

dnl ========================================================================
dnl Cluster stack - Corosync
dnl ========================================================================

dnl Normalize the values
case $SUPPORT_CS in
  1|yes|true)
    SUPPORT_CS=yes
    missingisfatal=1;;
  try) missingisfatal=0;;
  *) SUPPORT_CS=no;;
esac

AC_MSG_CHECKING(for native corosync)
COROSYNC_LIBS=""
CS_USES_LIBQB=0
PCMK_SERVICE_ID=9
LCRSODIR="$libdir"

if test $SUPPORT_CS = no; then
    AC_MSG_RESULT(no (disabled))
    SUPPORT_CS=0
else
    AC_MSG_RESULT($SUPPORT_CS, with '$CSPREFIX')
    PKG_CHECK_MODULES(cpg, libcpg) dnl Fatal
    PKG_CHECK_MODULES(cfg, libcfg) dnl Fatal
    PKG_CHECK_MODULES(cmap, libcmap, HAVE_cmap=1, HAVE_cmap=0)
    PKG_CHECK_MODULES(cman, libcman, HAVE_cman=1, HAVE_cman=0)
    PKG_CHECK_MODULES(confdb, libconfdb, HAVE_confdb=1, HAVE_confdb=0)
    PKG_CHECK_MODULES(fenced, libfenced, HAVE_fenced=1, HAVE_fenced=0)
    PKG_CHECK_MODULES(quorum, libquorum, HAVE_quorum=1, HAVE_quorum=0)
    PKG_CHECK_MODULES(oldipc, libcoroipcc, HAVE_oldipc=1, HAVE_oldipc=0)
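    dnl Corosync has shipped two generations of IPC: the old libcoroipcc
    dnl library and, in newer releases, IPC built on libqb.  The branch
    dnl below prefers whichever one pkg-config reports, roughly:
    dnl
    dnl   $ pkg-config --exists libcoroipcc && echo "old coroipcc IPC"
    dnl   $ pkg-config --exists libqb       && echo "libqb-based IPC"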
COROSYNC_LIBS="$COROSYNC_LIBS $oldipc_LIBS $cpg_LIBS $cfg_LIBS" elif test $HAVE_libqb = 1; then SUPPORT_CS=1 CS_USES_LIBQB=1 CFLAGS="$CFLAGS $libqb_FLAGS $cpg_FLAGS $cfg_FLAGS" COROSYNC_LIBS="$COROSYNC_LIBS $libqb_LIBS $cpg_LIBS $cfg_LIBS" AC_CHECK_LIB(corosync_common, cs_strerror) else aisreason="corosync/libqb IPC libraries not found by pkg_config" fi AC_DEFINE_UNQUOTED(HAVE_CONFDB, $HAVE_confdb, Have the old herarchial Corosync config API) AC_DEFINE_UNQUOTED(HAVE_CMAP, $HAVE_cmap, Have the new non-herarchial Corosync config API) fi if test $SUPPORT_CS = 1 -a x$HAVE_oldipc = x0 ; then dnl Support for plugins was removed about the time the IPC was dnl moved to libqb. dnl The only option now is the built-in quorum API CFLAGS="$CFLAGS $cmap_CFLAGS $quorum_CFLAGS" COROSYNC_LIBS="$COROSYNC_LIBS $cmap_LIBS $quorum_LIBS" STACKS="$STACKS corosync-native" AC_DEFINE_UNQUOTED(SUPPORT_CS_QUORUM, 1, Support the consumption of membership and quorum from corosync) fi SUPPORT_PLUGIN=0 if test $SUPPORT_CS = 1 -a x$HAVE_confdb = x1; then dnl Need confdb to support cman and the plugins SUPPORT_PLUGIN=1 LCRSODIR=`$PKGCONFIG corosync --variable=lcrsodir` STACKS="$STACKS corosync-plugin" COROSYNC_LIBS="$COROSYNC_LIBS $confdb_LIBS" if test $SUPPORT_CMAN != no; then if test $HAVE_cman = 1 -a $HAVE_fenced = 1; then SUPPORT_CMAN=1 STACKS="$STACKS cman" CFLAGS="$CFLAGS $cman_FLAGS $fenced_FLAGS" COROSYNC_LIBS="$COROSYNC_LIBS $cman_LIBS $fenced_LIBS" fi fi fi dnl Normalize SUPPORT_CS and SUPPORT_CMAN for use with #if directives if test $SUPPORT_CMAN != 1; then SUPPORT_CMAN=0 fi if test $SUPPORT_CS = 1; then CLUSTERLIBS="$CLUSTERLIBS $COROSYNC_LIBS" elif test $SUPPORT_CS != 0; then SUPPORT_CS=0 if test $missingisfatal = 0; then AC_MSG_WARN(Unable to support Corosync: $aisreason) else AC_MSG_FAILURE(Unable to support Corosync: $aisreason) fi fi AC_DEFINE_UNQUOTED(SUPPORT_COROSYNC, $SUPPORT_CS, Support the Corosync messaging and membership layer) AC_DEFINE_UNQUOTED(SUPPORT_CMAN, $SUPPORT_CMAN, Support the consumption of membership and quorum from cman) AC_DEFINE_UNQUOTED(CS_USES_LIBQB, $CS_USES_LIBQB, Does corosync use libqb for its ipc) AC_DEFINE_UNQUOTED(PCMK_SERVICE_ID, $PCMK_SERVICE_ID, Corosync service number) AC_DEFINE_UNQUOTED(SUPPORT_PLUGIN, $SUPPORT_PLUGIN, Support the Pacemaker plugin for Corosync) AM_CONDITIONAL(BUILD_CS_SUPPORT, test $SUPPORT_CS = 1) AM_CONDITIONAL(BUILD_CS_PLUGIN, test $SUPPORT_PLUGIN = 1) AC_SUBST(SUPPORT_CMAN) AC_SUBST(SUPPORT_CS) dnl dnl Cluster stack - Sanity dnl if test x${enable_no_stack} = xyes; then AC_MSG_NOTICE(No cluster stack supported. 
    AC_MSG_NOTICE(No cluster stack supported. Just building the Policy Engine)
    PCMK_FEATURES="$PCMK_FEATURES no-cluster-stack"
else
    AC_MSG_CHECKING(for supported stacks)
    if test x"$STACKS" = x; then
        AC_MSG_FAILURE(You must support at least one cluster stack (heartbeat or corosync) )
    fi
    AC_MSG_RESULT($STACKS)
    PCMK_FEATURES="$PCMK_FEATURES $STACKS"
fi

AC_SUBST(CLUSTERLIBS)
AC_SUBST(LCRSODIR)

dnl ========================================================================
dnl SNMP
dnl ========================================================================

case $SUPPORT_SNMP in
  1|yes|true) missingisfatal=1;;
  try) missingisfatal=0;;
  *) SUPPORT_SNMP=no;;
esac

SNMPLIBS=""

AC_MSG_CHECKING(for snmp support)
if test $SUPPORT_SNMP = no; then
    AC_MSG_RESULT(no (disabled))
    SUPPORT_SNMP=0
else
    SNMPCONFIG=""
    AC_MSG_RESULT($SUPPORT_SNMP)

    AC_CHECK_HEADERS(net-snmp/net-snmp-config.h)

    if test "x${ac_cv_header_net_snmp_net_snmp_config_h}" != "xyes"; then
        SUPPORT_SNMP="no"
    fi

    if test $SUPPORT_SNMP != no; then
        AC_PATH_PROGS(SNMPCONFIG, net-snmp-config)
        if test "X${SNMPCONFIG}" = "X"; then
            AC_MSG_RESULT(You need the net-snmp development package to continue.)
            SUPPORT_SNMP=no
        fi
    fi

    if test $SUPPORT_SNMP != no; then
        AC_MSG_CHECKING(for special snmp libraries)
        SNMPLIBS=`$SNMPCONFIG --agent-libs`
        AC_MSG_RESULT($SNMPLIBS)
    fi

    if test $SUPPORT_SNMP != no; then
        savedLibs=$LIBS
        LIBS="$LIBS $SNMPLIBS"

        dnl On many systems libcrypto is needed when linking against libsnmp.
        dnl Check to see if it exists, and if so use it.
        dnl AC_CHECK_LIB(crypto, CRYPTO_free, CRYPTOLIB="-lcrypto",)
        dnl AC_SUBST(CRYPTOLIB)

        AC_CHECK_FUNCS(netsnmp_transport_open_client)
        if test $ac_cv_func_netsnmp_transport_open_client != yes; then
            AC_CHECK_FUNCS(netsnmp_tdomain_transport)
            if test $ac_cv_func_netsnmp_tdomain_transport != yes; then
                SUPPORT_SNMP=no
            else
                AC_DEFINE_UNQUOTED(NETSNMPV53, 1, [Use the older 5.3 version of the net-snmp API])
            fi
        fi

        LIBS=$savedLibs
    fi

    if test $SUPPORT_SNMP = no; then
        SNMPLIBS=""
        SUPPORT_SNMP=0
        if test $missingisfatal = 0; then
            AC_MSG_WARN(Unable to support SNMP)
        else
            AC_MSG_FAILURE(Unable to support SNMP)
        fi
    else
        SUPPORT_SNMP=1
    fi
fi

if test $SUPPORT_SNMP = 1; then
    PCMK_FEATURES="$PCMK_FEATURES snmp"
fi

AC_SUBST(SNMPLIBS)
AM_CONDITIONAL(ENABLE_SNMP, test "$SUPPORT_SNMP" = "1")
AC_DEFINE_UNQUOTED(ENABLE_SNMP, $SUPPORT_SNMP, Build in support for sending SNMP traps)

dnl ========================================================================
dnl ESMTP
dnl ========================================================================

case $SUPPORT_ESMTP in
  1|yes|true) missingisfatal=1;;
  try) missingisfatal=0;;
  *) SUPPORT_ESMTP=no;;
esac

ESMTPLIBS=""

AC_MSG_CHECKING(for esmtp support)
if test $SUPPORT_ESMTP = no; then
    AC_MSG_RESULT(no (disabled))
    SUPPORT_ESMTP=0
else
    ESMTPCONFIG=""
    AC_MSG_RESULT($SUPPORT_ESMTP)

    AC_CHECK_HEADERS(libesmtp.h)
    if test "x${ac_cv_header_libesmtp_h}" != "xyes"; then
        SUPPORT_ESMTP="no"
    fi

    if test $SUPPORT_ESMTP != no; then
        AC_PATH_PROGS(ESMTPCONFIG, libesmtp-config)
        if test "X${ESMTPCONFIG}" = "X"; then
            AC_MSG_RESULT(You need the libesmtp development package to continue.)
            SUPPORT_ESMTP=no
        fi
    fi

    if test $SUPPORT_ESMTP != no; then
        AC_MSG_CHECKING(for special esmtp libraries)
        ESMTPLIBS=`$ESMTPCONFIG --libs | tr '\n' ' '`
        AC_MSG_RESULT($ESMTPLIBS)
    fi

    if test $SUPPORT_ESMTP = no; then
        SUPPORT_ESMTP=0
        if test $missingisfatal = 0; then
            AC_MSG_WARN(Unable to support ESMTP)
        else
            AC_MSG_FAILURE(Unable to support ESMTP)
        fi
    else
        SUPPORT_ESMTP=1
        PCMK_FEATURES="$PCMK_FEATURES libesmtp"
    fi
fi

AC_SUBST(ESMTPLIBS)
AM_CONDITIONAL(ENABLE_ESMTP, test "$SUPPORT_ESMTP" = "1")
AC_DEFINE_UNQUOTED(ENABLE_ESMTP, $SUPPORT_ESMTP, Build in support for sending mail notifications with ESMTP)

dnl ========================================================================
dnl ACL
dnl ========================================================================

case $SUPPORT_ACL in
  1|yes|true) missingisfatal=1;;
  try) missingisfatal=0;;
  *) SUPPORT_ACL=no;;
esac

+HAVE_QB_IPCC_READY=1
+AC_CHECK_LIB(qb, qb_ipcc_ready)
+if test $ac_cv_lib_qb_qb_ipcc_ready != yes; then
+    HAVE_QB_IPCC_READY=0
+fi
+
+AC_DEFINE_UNQUOTED(HAVE_QB_IPCC_READY, $HAVE_QB_IPCC_READY, Do we need to perform nanosleep() when looking for synchronous replies)
+
AC_MSG_CHECKING(for acl support)
if test $SUPPORT_ACL = no; then
    AC_MSG_RESULT(no (disabled))
    SUPPORT_ACL=0
else
    AC_MSG_RESULT($SUPPORT_ACL)
+    SUPPORT_ACL=1
    AC_CHECK_LIB(qb, qb_ipcs_connection_auth_set)
    if test $ac_cv_lib_qb_qb_ipcs_connection_auth_set != yes; then
        SUPPORT_ACL=0
-    else
-        SUPPORT_ACL=1
    fi

    if test $SUPPORT_ACL = 0; then
        if test $missingisfatal = 0; then
            AC_MSG_WARN(Unable to support ACL. You need to use libqb > 0.13.0)
        else
            AC_MSG_FAILURE(Unable to support ACL. You need to use libqb > 0.13.0)
        fi
    fi
fi

if test $SUPPORT_ACL = 1; then
    PCMK_FEATURES="$PCMK_FEATURES acls"
fi

AM_CONDITIONAL(ENABLE_ACL, test "$SUPPORT_ACL" = "1")
AC_DEFINE_UNQUOTED(ENABLE_ACL, $SUPPORT_ACL, Build in support for CIB ACL)

dnl ========================================================================
dnl GnuTLS
dnl ========================================================================

AC_CHECK_HEADERS(gnutls/gnutls.h)
AC_CHECK_HEADERS(security/pam_appl.h pam/pam_appl.h)

dnl GNUTLS library: Attempt to determine by 'libgnutls-config' program.
dnl If no 'libgnutls-config', try traditional autoconf means.
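dnl For reference, where the script is available the two queries below
dnl produce something like the following (illustration only; the exact
dnl output varies by distribution and GnuTLS version):
dnl
dnl   $ libgnutls-config --cflags
dnl   -I/usr/include
dnl   $ libgnutls-config --libs
dnl   -lgnutls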
AC_PATH_PROGS(LIBGNUTLS_CONFIG, libgnutls-config)
if test -n "$LIBGNUTLS_CONFIG"; then
    AC_MSG_CHECKING(for gnutls header flags)
    GNUTLSHEAD="`$LIBGNUTLS_CONFIG --cflags`";
    AC_MSG_RESULT($GNUTLSHEAD)
    AC_MSG_CHECKING(for gnutls library flags)
    GNUTLSLIBS="`$LIBGNUTLS_CONFIG --libs`";
    AC_MSG_RESULT($GNUTLSLIBS)
fi
AC_CHECK_LIB(gnutls, gnutls_init)
AC_CHECK_FUNCS(gnutls_priority_set_direct)

AC_SUBST(GNUTLSHEAD)
AC_SUBST(GNUTLSLIBS)

dnl ========================================================================
dnl System Health
dnl ========================================================================

dnl Check if servicelog development package is installed
SERVICELOG=servicelog-1
SERVICELOG_EXISTS="no"
AC_MSG_CHECKING(for $SERVICELOG packages)
if $PKGCONFIG --exists $SERVICELOG
then
    PKG_CHECK_MODULES([SERVICELOG], [servicelog-1])
    SERVICELOG_EXISTS="yes"
fi
AC_MSG_RESULT($SERVICELOG_EXISTS)
AM_CONDITIONAL(BUILD_SERVICELOG, test "$SERVICELOG_EXISTS" = "yes")

dnl Check if OpenIPMI packages and servicelog are installed
OPENIPMI="OpenIPMI OpenIPMIposix"
OPENIPMI_SERVICELOG_EXISTS="no"
AC_MSG_CHECKING(for $SERVICELOG $OPENIPMI packages)
if $PKGCONFIG --exists $OPENIPMI $SERVICELOG
then
    PKG_CHECK_MODULES([OPENIPMI_SERVICELOG],[OpenIPMI OpenIPMIposix])
    OPENIPMI_SERVICELOG_EXISTS="yes"
fi
AC_MSG_RESULT($OPENIPMI_SERVICELOG_EXISTS)
AM_CONDITIONAL(BUILD_OPENIPMI_SERVICELOG, test "$OPENIPMI_SERVICELOG_EXISTS" = "yes")

dnl ========================================================================
dnl Compiler flags
dnl ========================================================================

dnl Make sure that CFLAGS is not exported. If the user did
dnl not have CFLAGS in their environment then this should have
dnl no effect. However if CFLAGS was exported from the user's
dnl environment, then the new CFLAGS will also be exported
dnl to sub processes.
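dnl The save/unset/restore dance below relies on POSIX sh semantics:
dnl "unset CFLAGS" discards the value and the export attribute together,
dnl and the subsequent plain assignment does not re-export it, so child
dnl processes no longer inherit it.  Roughly:
dnl
dnl   export CFLAGS=-O2     # exported: visible to every child process
dnl   SAVED_CFLAGS=$CFLAGS
dnl   unset CFLAGS          # value and export attribute both dropped
dnl   CFLAGS=$SAVED_CFLAGS  # same value again, but no longer exported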
CC_ERRORS="" CC_EXTRAS="" if export | fgrep " CFLAGS=" > /dev/null; then SAVED_CFLAGS="$CFLAGS" unset CFLAGS CFLAGS="$SAVED_CFLAGS" unset SAVED_CFLAGS fi if test "$GCC" != yes; then CFLAGS="$CFLAGS -g" enable_fatal_warnings=no else CFLAGS="$CFLAGS -ggdb" # We had to eliminate -Wnested-externs because of libtool changes EXTRA_FLAGS="-fgnu89-inline -fstack-protector-all -Wall -Waggregate-return -Wbad-function-cast -Wcast-align -Wdeclaration-after-statement -Wendif-labels -Wfloat-equal -Wformat=2 -Wformat-security -Wformat-nonliteral -Wmissing-prototypes -Wmissing-declarations -Wnested-externs -Wno-long-long -Wno-strict-aliasing -Wunused-but-set-variable -Wpointer-arith -Wstrict-prototypes -Wunsigned-char -Wwrite-strings" # Additional warnings it might be nice to enable one day # -Wshadow # -Wunreachable-code for j in $EXTRA_FLAGS do if cc_supports_flag $j then CC_EXTRAS="$CC_EXTRAS $j" fi done dnl In lib/ais/Makefile.am there's a gcc option available as of v4.x GCC_MAJOR=`gcc -v 2>&1 | awk 'END{print $3}' | sed 's/[.].*//'` AM_CONDITIONAL(GCC_4, test "${GCC_MAJOR}" = 4) dnl System specific options case "$host_os" in *linux*|*bsd*) if test "${enable_fatal_warnings}" = "unknown"; then enable_fatal_warnings=yes fi ;; esac if test "x${enable_fatal_warnings}" != xno && cc_supports_flag -Werror ; then enable_fatal_warnings=yes else enable_fatal_warnings=no fi if test "x${enable_ansi}" = xyes && cc_supports_flag -std=iso9899:199409 ; then AC_MSG_NOTICE(Enabling ANSI Compatibility) CC_EXTRAS="$CC_EXTRAS -ansi -D_GNU_SOURCE -DANSI_ONLY" fi AC_MSG_NOTICE(Activated additional gcc flags: ${CC_EXTRAS}) fi CFLAGS="$CFLAGS $CC_EXTRAS" NON_FATAL_CFLAGS="$CFLAGS" AC_SUBST(NON_FATAL_CFLAGS) dnl dnl We reset CFLAGS to include our warnings *after* all function dnl checking goes on, so that our warning flags don't keep the dnl AC_*FUNCS() calls above from working. In particular, -Werror will dnl *always* cause us troubles if we set it before here. dnl dnl if test "x${enable_fatal_warnings}" = xyes ; then AC_MSG_NOTICE(Enabling Fatal Warnings) CFLAGS="$CFLAGS -Werror" fi AC_SUBST(CFLAGS) dnl This is useful for use in Makefiles that need to remove one specific flag CFLAGS_COPY="$CFLAGS" AC_SUBST(CFLAGS_COPY) AC_SUBST(LIBADD_DL) dnl extra flags for dynamic linking libraries AC_SUBST(LIBADD_INTL) dnl extra flags for GNU gettext stuff... 
if test "x${enable_fatal_warnings}" = xyes ; then
    AC_MSG_NOTICE(Enabling Fatal Warnings)
    CFLAGS="$CFLAGS -Werror"
fi
AC_SUBST(CFLAGS)

dnl This is useful for use in Makefiles that need to remove one specific flag
CFLAGS_COPY="$CFLAGS"
AC_SUBST(CFLAGS_COPY)

AC_SUBST(LIBADD_DL)    dnl extra flags for dynamic linking libraries
AC_SUBST(LIBADD_INTL)  dnl extra flags for GNU gettext stuff...
AC_SUBST(LOCALE)

dnl Options for cleaning up the compiler output
QUIET_LIBTOOL_OPTS=""
QUIET_MAKE_OPTS=""
if test "x${enable_quiet}" = "xyes"; then
    QUIET_LIBTOOL_OPTS="--quiet"
    QUIET_MAKE_OPTS="--quiet"
fi

AC_MSG_RESULT(Suppress make details: ${enable_quiet})

dnl Put the above variables to use
LIBTOOL="${LIBTOOL} --tag=CC \$(QUIET_LIBTOOL_OPTS)"
MAKE="${MAKE} \$(QUIET_MAKE_OPTS)"

AC_SUBST(CC)
AC_SUBST(MAKE)
AC_SUBST(LIBTOOL)
AC_SUBST(QUIET_MAKE_OPTS)
AC_SUBST(QUIET_LIBTOOL_OPTS)

AC_DEFINE_UNQUOTED(CRM_FEATURES, "$PCMK_FEATURES", Set of enabled features)
AC_SUBST(PCMK_FEATURES)

dnl The Makefiles and shell scripts we output
AC_CONFIG_FILES(Makefile \
+Doxyfile \
coverage.sh \
cts/Makefile \
cts/CTSvars.py \
cts/LSBDummy \
cts/benchmark/Makefile \
cts/benchmark/clubench \
cib/Makefile \
crmd/Makefile \
pengine/Makefile \
pengine/regression.core.sh \
doc/Makefile \
doc/Pacemaker_Explained/publican.cfg \
doc/Clusters_from_Scratch/publican.cfg \
include/Makefile \
include/crm/Makefile \
include/crm/cib/Makefile \
include/crm/common/Makefile \
include/crm/cluster/Makefile \
include/crm/fencing/Makefile \
include/crm/pengine/Makefile \
replace/Makefile \
lib/Makefile \
lib/pcmk.pc \
lib/pcmk-pe.pc \
lib/pcmk-cib.pc \
lib/ais/Makefile \
lib/common/Makefile \
lib/cluster/Makefile \
lib/cib/Makefile \
lib/pengine/Makefile \
lib/transition/Makefile \
lib/fencing/Makefile \
lib/lrmd/Makefile \
lib/services/Makefile \
mcp/Makefile \
mcp/pacemaker \
mcp/pacemaker.service \
fencing/Makefile \
fencing/regression.py \
lrmd/Makefile \
lrmd/regression.py \
extra/Makefile \
extra/resources/Makefile \
extra/rgmanager/Makefile \
tools/Makefile \
tools/crm_report \
xml/Makefile \
lib/gnu/Makefile \
)

dnl Now process the entire list of files added by previous
dnl calls to AC_CONFIG_FILES()
AC_OUTPUT()

dnl *****************
dnl Configure summary
dnl *****************

AC_MSG_RESULT([])
AC_MSG_RESULT([$PACKAGE configuration:])
AC_MSG_RESULT([  Version                  = ${VERSION} (Build: $BUILD_VERSION)])
AC_MSG_RESULT([  Features                 =${PCMK_FEATURES}])
AC_MSG_RESULT([])
AC_MSG_RESULT([  Prefix                   = ${prefix}])
AC_MSG_RESULT([  Executables              = ${sbindir}])
AC_MSG_RESULT([  Man pages                = ${mandir}])
AC_MSG_RESULT([  Libraries                = ${libdir}])
AC_MSG_RESULT([  Header files             = ${includedir}])
AC_MSG_RESULT([  Arch-independent files   = ${datadir}])
AC_MSG_RESULT([  State information        = ${localstatedir}])
AC_MSG_RESULT([  System configuration     = ${sysconfdir}])
AC_MSG_RESULT([  Corosync Plugins         = ${LCRSODIR}])
AC_MSG_RESULT([])
AC_MSG_RESULT([  Use system LTDL          = ${ac_cv_lib_ltdl_lt_dlopen}])
AC_MSG_RESULT([])
AC_MSG_RESULT([  HA group name            = ${CRM_DAEMON_GROUP}])
AC_MSG_RESULT([  HA user name             = ${CRM_DAEMON_USER}])
AC_MSG_RESULT([])
AC_MSG_RESULT([  CFLAGS                   = ${CFLAGS}])
AC_MSG_RESULT([  Libraries                = ${LIBS}])
AC_MSG_RESULT([  Stack Libraries          = ${CLUSTERLIBS}])

diff --git a/crmd/te_actions.c b/crmd/te_actions.c
index ca02fe5e53..44398e3c04 100644
--- a/crmd/te_actions.c
+++ b/crmd/te_actions.c
@@ -1,528 +1,528 @@
/*
 * Copyright (C) 2004 Andrew Beekhof
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public
 * License as published by the Free Software Foundation; either
 * version 2 of the License, or (at your option) any later version.
 *
 * This software is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License for more details.
* * You should have received a copy of the GNU General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ #include #include #include #include #include #include #include #include #include #include char *te_uuid = NULL; void send_rsc_command(crm_action_t * action); static void te_start_action_timer(crm_graph_t * graph, crm_action_t * action) { action->timer = calloc(1, sizeof(crm_action_timer_t)); action->timer->timeout = action->timeout; action->timer->reason = timeout_action; action->timer->action = action; action->timer->source_id = g_timeout_add(action->timer->timeout + graph->network_delay, action_timer_callback, (void *)action->timer); CRM_ASSERT(action->timer->source_id != 0); } static gboolean te_pseudo_action(crm_graph_t * graph, crm_action_t * pseudo) { crm_debug("Pseudo action %d fired and confirmed", pseudo->id); pseudo->confirmed = TRUE; update_graph(graph, pseudo); trigger_graph(); return TRUE; } void send_stonith_update(crm_action_t * action, const char *target, const char *uuid) { int rc = pcmk_ok; crm_node_t *peer = NULL; /* zero out the node-status & remove all LRM status info */ xmlNode *node_state = NULL; CRM_CHECK(target != NULL, return); CRM_CHECK(uuid != NULL, return); if(get_node_uuid(0, target) == NULL) { set_node_uuid(target, uuid); } /* Make sure the membership and join caches are accurate */ peer = crm_get_peer(0, target); if(peer->uuid == NULL) { crm_info("Recording uuid '%s' for node '%s'", uuid, target); peer->uuid = strdup(uuid); } crm_update_peer_proc(__FUNCTION__, peer, crm_proc_none, NULL); crm_update_peer_state(__FUNCTION__, peer, CRM_NODE_LOST, 0); crm_update_peer_expected(__FUNCTION__, peer, CRMD_JOINSTATE_DOWN); erase_node_from_join(target); node_state = do_update_node_cib(peer, node_update_cluster|node_update_peer|node_update_join|node_update_expected, NULL, __FUNCTION__); /* Force our known ID */ crm_xml_add(node_state, XML_ATTR_UUID, uuid); rc = fsa_cib_conn->cmds->update(fsa_cib_conn, XML_CIB_TAG_STATUS, node_state, cib_quorum_override | cib_scope_local | cib_can_create); /* Delay processing the trigger until the update completes */ crm_debug("Sending fencing update %d for %s", rc, target); add_cib_op_callback(fsa_cib_conn, rc, FALSE, strdup(target), cib_fencing_updated); /* Make sure it sticks */ /* fsa_cib_conn->cmds->bump_epoch(fsa_cib_conn, cib_quorum_override|cib_scope_local); */ erase_status_tag(target, XML_CIB_TAG_LRM, cib_scope_local); erase_status_tag(target, XML_TAG_TRANSIENT_NODEATTRS, cib_scope_local); free_xml(node_state); return; } static gboolean te_fence_node(crm_graph_t * graph, crm_action_t * action) { int rc = 0; const char *id = NULL; const char *uuid = NULL; const char *target = NULL; const char *type = NULL; gboolean invalid_action = FALSE; enum stonith_call_options options = st_opt_none; id = ID(action->xml); target = crm_element_value(action->xml, XML_LRM_ATTR_TARGET); uuid = crm_element_value(action->xml, XML_LRM_ATTR_TARGET_UUID); type = crm_meta_value(action->params, "stonith_action"); CRM_CHECK(id != NULL, invalid_action = TRUE); CRM_CHECK(uuid != NULL, invalid_action = TRUE); CRM_CHECK(type != NULL, invalid_action = TRUE); CRM_CHECK(target != NULL, invalid_action = TRUE); if (invalid_action) { crm_log_xml_warn(action->xml, "BadAction"); return FALSE; } crm_notice("Executing %s fencing operation (%s) on %s (timeout=%d)", type, id, target, transition_graph->stonith_timeout); /* Passing NULL means block until we can 
connect... */ te_connect_stonith(NULL); if (confirmed_nodes && g_hash_table_size(confirmed_nodes) == 1) { options |= st_opt_allow_suicide; } rc = stonith_api->cmds->fence(stonith_api, options, target, type, transition_graph->stonith_timeout / 1000, 0); stonith_api->cmds->register_callback( stonith_api, rc, transition_graph->stonith_timeout / 1000, st_opt_timeout_updates, generate_transition_key(transition_graph->id, action->id, 0, te_uuid), "tengine_stonith_callback", tengine_stonith_callback); return TRUE; } static int get_target_rc(crm_action_t * action) { const char *target_rc_s = crm_meta_value(action->params, XML_ATTR_TE_TARGET_RC); if (target_rc_s != NULL) { return crm_parse_int(target_rc_s, "0"); } return 0; } static gboolean te_crm_command(crm_graph_t * graph, crm_action_t * action) { char *counter = NULL; xmlNode *cmd = NULL; gboolean is_local = FALSE; const char *id = NULL; const char *task = NULL; const char *value = NULL; const char *on_node = NULL; gboolean rc = TRUE; gboolean no_wait = FALSE; id = ID(action->xml); task = crm_element_value(action->xml, XML_LRM_ATTR_TASK); on_node = crm_element_value(action->xml, XML_LRM_ATTR_TARGET); CRM_CHECK(on_node != NULL && strlen(on_node) != 0, crm_err( "Corrupted command (id=%s) %s: no node", crm_str(id), crm_str(task)); return FALSE); crm_info( "Executing crm-event (%s): %s on %s%s%s", crm_str(id), crm_str(task), on_node, is_local ? " (local)" : "", no_wait ? " - no waiting" : ""); if (safe_str_eq(on_node, fsa_our_uname)) { is_local = TRUE; } value = crm_meta_value(action->params, XML_ATTR_TE_NOWAIT); if (crm_is_true(value)) { no_wait = TRUE; } if (is_local && safe_str_eq(task, CRM_OP_SHUTDOWN)) { /* defer until everything else completes */ crm_info( "crm-event (%s) is a local shutdown", crm_str(id)); graph->completion_action = tg_shutdown; graph->abort_reason = "local shutdown"; action->confirmed = TRUE; update_graph(graph, action); trigger_graph(); return TRUE; } else if(safe_str_eq(task, CRM_OP_SHUTDOWN)) { crm_node_t *peer = crm_get_peer(0, on_node); crm_update_peer_expected(__FUNCTION__, peer, CRMD_JOINSTATE_DOWN); } cmd = create_request(task, action->xml, on_node, CRM_SYSTEM_CRMD, CRM_SYSTEM_TENGINE, NULL); counter = generate_transition_key(transition_graph->id, action->id, get_target_rc(action), te_uuid); crm_xml_add(cmd, XML_ATTR_TRANSITION_KEY, counter); rc = send_cluster_message(crm_get_peer(0, on_node), crm_msg_crmd, cmd, TRUE); free(counter); free_xml(cmd); if (rc == FALSE) { crm_err("Action %d failed: send", action->id); return FALSE; } else if (no_wait) { action->confirmed = TRUE; update_graph(graph, action); trigger_graph(); } else { if (action->timeout <= 0) { crm_err("Action %d: %s on %s had an invalid timeout (%dms). 
Using %dms instead", action->id, task, on_node, action->timeout, graph->network_delay); action->timeout = graph->network_delay; } te_start_action_timer(graph, action); } return TRUE; } gboolean cib_action_update(crm_action_t * action, int status, int op_rc) { lrmd_event_data_t *op = NULL; xmlNode *state = NULL; xmlNode *rsc = NULL; xmlNode *xml_op = NULL; xmlNode *action_rsc = NULL; int rc = pcmk_ok; const char *name = NULL; const char *value = NULL; const char *rsc_id = NULL; const char *task = crm_element_value(action->xml, XML_LRM_ATTR_TASK); const char *target = crm_element_value(action->xml, XML_LRM_ATTR_TARGET); const char *task_uuid = crm_element_value(action->xml, XML_LRM_ATTR_TASK_KEY); const char *target_uuid = crm_element_value(action->xml, XML_LRM_ATTR_TARGET_UUID); int call_options = cib_quorum_override | cib_scope_local; int target_rc = get_target_rc(action); if (status == PCMK_LRM_OP_PENDING) { crm_debug("%s %d: Recording pending operation %s on %s", crm_element_name(action->xml), action->id, task_uuid, target); } else { crm_warn("%s %d: %s on %s timed out", crm_element_name(action->xml), action->id, task_uuid, target); } action_rsc = find_xml_node(action->xml, XML_CIB_TAG_RESOURCE, TRUE); if (action_rsc == NULL) { return FALSE; } rsc_id = ID(action_rsc); CRM_CHECK(rsc_id != NULL, crm_log_xml_err(action->xml, "Bad:action"); return FALSE); /* update the CIB */ state = create_xml_node(NULL, XML_CIB_TAG_STATE); crm_xml_add(state, XML_ATTR_UUID, target_uuid); crm_xml_add(state, XML_ATTR_UNAME, target); rsc = create_xml_node(state, XML_CIB_TAG_LRM); crm_xml_add(rsc, XML_ATTR_ID, target_uuid); rsc = create_xml_node(rsc, XML_LRM_TAG_RESOURCES); rsc = create_xml_node(rsc, XML_LRM_TAG_RESOURCE); crm_xml_add(rsc, XML_ATTR_ID, rsc_id); name = XML_ATTR_TYPE; value = crm_element_value(action_rsc, name); crm_xml_add(rsc, name, value); name = XML_AGENT_ATTR_CLASS; value = crm_element_value(action_rsc, name); crm_xml_add(rsc, name, value); name = XML_AGENT_ATTR_PROVIDER; value = crm_element_value(action_rsc, name); crm_xml_add(rsc, name, value); op = convert_graph_action(NULL, action, status, op_rc); op->call_id = -1; op->user_data = generate_transition_key(transition_graph->id, action->id, target_rc, te_uuid); xml_op = create_operation_update(rsc, op, CRM_FEATURE_SET, target_rc, __FUNCTION__, LOG_INFO); lrmd_free_event(op); crm_trace("Updating CIB with \"%s\" (%s): %s %s on %s", status < 0 ? 
"new action" : XML_ATTR_TIMEOUT, crm_element_name(action->xml), crm_str(task), rsc_id, target); crm_log_xml_trace(xml_op, "Op"); rc = fsa_cib_conn->cmds->update(fsa_cib_conn, XML_CIB_TAG_STATUS, state, call_options); crm_trace("Updating CIB with %s action %d: %s on %s (call_id=%d)", services_lrm_status_str(status), action->id, task_uuid, target, rc); add_cib_op_callback(fsa_cib_conn, rc, FALSE, NULL, cib_action_updated); free_xml(state); action->sent_update = TRUE; if (rc < pcmk_ok) { return FALSE; } return TRUE; } static gboolean te_rsc_command(crm_graph_t * graph, crm_action_t * action) { /* never overwrite stop actions in the CIB with * anything other than completed results * * Writing pending stops makes it look like the * resource is running again */ xmlNode *cmd = NULL; xmlNode *rsc_op = NULL; gboolean rc = TRUE; gboolean no_wait = FALSE; gboolean is_local = FALSE; char *counter = NULL; const char *task = NULL; const char *value = NULL; const char *on_node = NULL; const char *task_uuid = NULL; CRM_ASSERT(action != NULL); CRM_ASSERT(action->xml != NULL); action->executed = FALSE; on_node = crm_element_value(action->xml, XML_LRM_ATTR_TARGET); CRM_CHECK(on_node != NULL && strlen(on_node) != 0, crm_err( "Corrupted command(id=%s) %s: no node", ID(action->xml), crm_str(task)); return FALSE); rsc_op = action->xml; task = crm_element_value(rsc_op, XML_LRM_ATTR_TASK); task_uuid = crm_element_value(action->xml, XML_LRM_ATTR_TASK_KEY); on_node = crm_element_value(rsc_op, XML_LRM_ATTR_TARGET); counter = generate_transition_key(transition_graph->id, action->id, get_target_rc(action), te_uuid); crm_xml_add(rsc_op, XML_ATTR_TRANSITION_KEY, counter); if (safe_str_eq(on_node, fsa_our_uname)) { is_local = TRUE; } value = crm_meta_value(action->params, XML_ATTR_TE_NOWAIT); if (crm_is_true(value)) { no_wait = TRUE; } crm_info("Initiating action %d: %s %s on %s%s%s", action->id, task, task_uuid, on_node, is_local ? " (local)" : "", no_wait ? " - no waiting" : ""); cmd = create_request(CRM_OP_INVOKE_LRM, rsc_op, on_node, CRM_SYSTEM_LRMD, CRM_SYSTEM_TENGINE, NULL); if (is_local) { /* shortcut local resource commands */ ha_msg_input_t data = { .msg = cmd, .xml = rsc_op, }; fsa_data_t msg = { .id = 0, .data = &data, .data_type = fsa_dt_ha_msg, .fsa_input = I_NULL, .fsa_cause = C_FSA_INTERNAL, .actions = A_LRM_INVOKE, .origin = __FUNCTION__, }; do_lrm_invoke(A_LRM_INVOKE, C_FSA_INTERNAL, fsa_state, I_NULL, &msg); } else { rc = send_cluster_message(crm_get_peer(0, on_node), crm_msg_lrmd, cmd, TRUE); } free(counter); free_xml(cmd); action->executed = TRUE; if (rc == FALSE) { crm_err("Action %d failed: send", action->id); return FALSE; } else if (no_wait) { action->confirmed = TRUE; update_graph(transition_graph, action); trigger_graph(); } else { if (action->timeout <= 0) { crm_err("Action %d: %s %s on %s had an invalid timeout (%dms). 
Using %dms instead", action->id, task, task_uuid, on_node, action->timeout, graph->network_delay); action->timeout = graph->network_delay; } te_start_action_timer(graph, action); } value = crm_meta_value(action->params, XML_OP_ATTR_PENDING); - if (crm_is_true(value)) { + if (crm_is_true(value) && safe_str_neq(task, CRMD_ACTION_CANCEL)) { /* write a "pending" entry to the CIB, inhibit notification */ - crm_info("Recording pending op %s in the CIB", task_uuid); + crm_debug("Recording pending op %s in the CIB", task_uuid); cib_action_update(action, PCMK_LRM_OP_PENDING, PCMK_EXECRA_STATUS_UNKNOWN); } return TRUE; } crm_graph_functions_t te_graph_fns = { te_pseudo_action, te_rsc_command, te_crm_command, te_fence_node }; void notify_crmd(crm_graph_t * graph) { const char *type = "unknown"; enum crmd_fsa_input event = I_NULL; crm_debug("Processing transition completion in state %s", fsa_state2string(fsa_state)); CRM_CHECK(graph->complete, graph->complete = TRUE); switch (graph->completion_action) { case tg_stop: type = "stop"; /* fall through */ case tg_done: type = "done"; if (fsa_state == S_TRANSITION_ENGINE) { event = I_TE_SUCCESS; } break; case tg_restart: type = "restart"; if (fsa_state == S_TRANSITION_ENGINE) { if (transition_timer->period_ms > 0) { crm_timer_stop(transition_timer); crm_timer_start(transition_timer); } else if(too_many_st_failures() == FALSE) { event = I_PE_CALC; } } else if (fsa_state == S_POLICY_ENGINE) { register_fsa_action(A_PE_INVOKE); } break; case tg_shutdown: type = "shutdown"; if (is_set(fsa_input_register, R_SHUTDOWN)) { event = I_STOP; } else { crm_err("We didn't ask to be shut down, yet our" " PE is telling us too."); event = I_TERMINATE; } } crm_debug( "Transition %d status: %s - %s", graph->id, type, crm_str(graph->abort_reason)); graph->abort_reason = NULL; graph->completion_action = tg_done; clear_bit(fsa_input_register, R_IN_TRANSITION); if (event != I_NULL) { register_fsa_input(C_FSA_INTERNAL, event, NULL); } else if (fsa_source) { mainloop_set_trigger(fsa_source); } } diff --git a/cts/CIB.py b/cts/CIB.py index 33d63c97a7..4767b70928 100644 --- a/cts/CIB.py +++ b/cts/CIB.py @@ -1,613 +1,608 @@ '''CTS: Cluster Testing System: CIB generator ''' __copyright__=''' Author: Andrew Beekhof Copyright (C) 2008 Andrew Beekhof ''' from UserDict import UserDict import sys, time, types, syslog, os, struct, string, signal, traceback, warnings, socket from cts.CTSvars import * from cts.CTS import ClusterManager class CibBase: cts_cib = None version = "unknown" feature_set = "unknown" Factory = None def __init__(self, CM, factory, tmpfile=None): self.CM = CM self.Factory = factory if not tmpfile: warnings.filterwarnings("ignore") tmpfile=os.tmpnam() warnings.resetwarnings() self.Factory.tmpfile = tmpfile def version(self): return self.version def NextIP(self): fields = string.split(self.CM.Env["IPBase"], '.') fields[3] = str(int(fields[3])+1) ip = string.join(fields, '.') self.CM.Env["IPBase"] = ip return ip class CibXml: def __init__(self, Factory, tag, _id, **kwargs): self.tag = tag self.name = _id self.kwargs = kwargs self.children = [] self.Factory = Factory def add_child(self, child): self.children.append(child) def __setitem__(self, key, value): self.kwargs[key] = value def show(self): text = '''<%s''' % self.tag if self.name: text += ''' id="%s"''' % (self.name) for k in self.kwargs.keys(): text += ''' %s="%s"''' % (k, self.kwargs[k]) if not self.children: text += '''/>''' return text text += '''>''' for c in self.children: text += c.show() text += '''''' % 
self.tag return text def _run(self, operation, xml, section="all", options=""): self.Factory.debug("Writing out %s" % self.name) fixed = "HOME=/root CIB_file="+self.Factory.tmpfile fixed += " cibadmin --%s --scope %s %s --xml-text '%s'" % (operation, section, options, xml) rc = self.Factory.rsh(self.Factory.target, fixed) if rc != 0: self.Factory.log("Configure call failed: "+fixed) sys.exit(1) class FencingTopology(CibXml): def __init__(self, Factory): CibXml.__init__(self, Factory, "fencing-topology", None) def level(self, index, node, devices): self.add_child(CibXml(self.Factory, "fencing-level", "%s.%d" % (node, index), target=node, index=index, devices=devices)) def commit(self): self._run("create", self.show(), "configuration", "--allow-create") class Option(CibXml): def __init__(self, Factory, name=None, value=None, section="cib-bootstrap-options"): CibXml.__init__(self, Factory, "cluster_property_set", section) if name and value: self.add_child(CibXml(Factory, "nvpair", "cts-%s" % name, name=name, value=value)) def __setitem__(self, key, value): self.add_child(CibXml(self.Factory, "nvpair", "cts-%s" % key, name=key, value=value)) def commit(self): self._run("modify", self.show(), "crm_config", "--allow-create") class Expression(CibXml): def __init__(self, Factory, name, attr, op, value=None): CibXml.__init__(self, Factory, "expression", name, attribute=attr, operation=op) if value: self["value"] = value class Rule(CibXml): def __init__(self, Factory, name, score, op="and", expr=None): CibXml.__init__(self, Factory, "rule", "%s" % name) self["boolean-op"] = op self["score"] = score if expr: self.add_child(expr) class Resource(CibXml): def __init__(self, Factory, name, rtype, standard, provider=None): CibXml.__init__(self, Factory, "native", name) self.rtype = rtype self.standard = standard self.provider = provider self.op=[] self.meta={} self.param={} self.scores={} self.needs={} self.coloc={} if self.standard == "ocf" and not provider: self.provider = "heartbeat" elif self.standard == "lsb": self.provider = None def __setitem__(self, key, value): self.add_param(key, value) def add_op(self, name, interval, **kwargs): self.op.append( CibXml(self.Factory, "op", "%s-%s" % (name, interval), name=name, interval=interval, **kwargs)) def add_param(self, name, value): self.param[name] = value def add_meta(self, name, value): self.meta[name] = value def prefer(self, node, score="INFINITY", rule=None): if not rule: rule = Rule(self.Factory, "prefer-%s-r" % node, score, expr=Expression(self.Factory, "prefer-%s-e" % node, "#uname", "eq", node)) self.scores[node] = rule def after(self, resource, kind="Mandatory", first="start", then="start", **kwargs): kargs = kwargs.copy() kargs["kind"] = kind if then: kargs["first-action"] = "start" kargs["then-action"] = then if first: kargs["first-action"] = first self.needs[resource] = kargs def colocate(self, resource, score="INFINITY", role=None, withrole=None, **kwargs): kargs = kwargs.copy() kargs["score"] = score if role: kargs["rsc-role"] = role if withrole: kargs["with-rsc-role"] = withrole self.coloc[resource] = kargs def constraints(self): text = "" for k in self.scores.keys(): text += '''''' % (k, self.name) text += self.scores[k].show() text += '''''' for k in self.needs.keys(): text += '''''' for k in self.coloc.keys(): text += '''''' text += "" return text def show(self): text = '''''' if len(self.meta) > 0: text += '''''' % self.name for p in self.meta.keys(): text += '''''' % (self.name, p, p, self.meta[p]) text += '''''' if len(self.param) > 
0: text += '''''' % self.name for p in self.param.keys(): text += '''''' % (self.name, p, p, self.param[p]) text += '''''' if len(self.op) > 0: text += '''''' for o in self.op: key = o.name o.name = "%s-%s" % (self.name, key) text += o.show() o.name = key text += '''''' text += '''''' return text def commit(self): self._run("create", self.show(), "resources") self._run("modify", self.constraints()) class Group(Resource): def __init__(self, Factory, name): Resource.__init__(self, Factory, name, None, None) self.tag = "group" def __setitem__(self, key, value): self.add_meta(key, value) def show(self): text = '''<%s id="%s">''' % (self.tag, self.name) if len(self.meta) > 0: text += '''''' % self.name for p in self.meta.keys(): text += '''''' % (self.name, p, p, self.meta[p]) text += '''''' for c in self.children: text += c.show() text += '''''' % self.tag return text class Clone(Group): def __init__(self, Factory, name, child=None): Group.__init__(self, Factory, name) self.tag = "clone" if child: self.add_child(child) def add_child(self, resource): if not self.children: self.children.append(resource) else: self.Factory.log("Clones can only have a single child. Ignoring %s" % resource.name) class Master(Clone): def __init__(self, Factory, name, child=None): Clone.__init__(self, Factory, name, child) self.tag = "master" class CIB11(CibBase): feature_set = "3.0" version = "pacemaker-1.1" def _show(self, command=""): output = "" (rc, result) = self.Factory.rsh(self.Factory.target, "HOME=/root CIB_file="+self.Factory.tmpfile+" cibadmin -Ql "+command, None, ) for line in result: output += line self.Factory.debug("Generated Config: "+line) return output def NewIP(self, name=None, standard="ocf"): ip = self.NextIP() if not name: name = "r"+ip r = Resource(self.Factory, name, "IPaddr2", standard) r["ip"] = ip r["cidr_netmask"] = "32" r.add_op("monitor", "5s") return r def install(self, target): old = self.Factory.tmpfile # Force a rebuild self.cts_cib = None self.Factory.tmpfile = CTSvars.CRM_CONFIG_DIR+"/cib.xml" self.contents(target) self.Factory.rsh(self.Factory.target, "chown "+CTSvars.CRM_DAEMON_USER+" "+self.Factory.tmpfile) self.Factory.tmpfile = old def contents(self, target=None): # fencing resource if self.cts_cib: return self.cts_cib if target: self.Factory.target = target self.Factory.rsh(self.Factory.target, "HOME=/root cibadmin --empty %s > %s" % (self.version, self.Factory.tmpfile)) #cib_base = self.cib_template % (self.feature_set, self.version, ''' remote-tls-port='9898' remote-clear-port='9999' ''') nodelist = "" self.num_nodes = 0 for node in self.CM.Env["nodes"]: nodelist += node + " " self.num_nodes = self.num_nodes + 1 no_quorum = "stop" if self.num_nodes < 3: no_quorum = "ignore" self.Factory.log("Cluster only has %d nodes, configuring: no-quorum-policy=ignore" % self.num_nodes) # Fencing resource # Define first so that the shell doesn't reject every update if self.CM.Env["DoFencing"]: st = Resource(self.Factory, "Fencing", self.CM.Env["stonith-type"], "stonith") # Set a threshold for unreliable stonith devices such as the vmware one st.add_meta("migration-threshold", "5") st.add_op("monitor", "120s", timeout="300s") st.add_op("stop", "0", timeout="180s") st.add_op("start", "0", timeout="180s") entries = string.split(self.CM.Env["stonith-params"], ',') for entry in entries: (name, value) = string.split(entry, '=') if name == "hostlist" and value == "all": value = string.join(self.CM.Env["nodes"], " ") elif name == "pcmk_host_list" and value == "all": value =
string.join(self.CM.Env["nodes"], " ") st[name] = value st.commit() # Test advanced fencing logic if True: stf_nodes = [] stt_nodes = [] - # Cheat to create a second copy of the real device - st.name = "FencingDup" - st.colocate("Fencing", "-INFINITY") - st.commit() - # Create the levels stl = FencingTopology(self.Factory) for node in self.CM.Env["nodes"]: ftype = self.CM.Env.RandomGen.choice(["levels-and", "levels-or ", "broadcast "]) self.CM.log(" - Using %s fencing for node: %s" % (ftype, node)) if ftype == "levels-and": stl.level(1, node, "FencingPass,Fencing") stt_nodes.append(node) elif ftype == "levels-or ": stl.level(1, node, "FencingFail") stl.level(2, node, "Fencing") stf_nodes.append(node) # Create a Dummy agent that always passes for levels-and if len(stt_nodes): ftype="fence_true" self.CM.install_helper(ftype, destdir="/usr/sbin") stt = Resource(self.Factory, "FencingPass", ftype, "stonith") stt["pcmk_host_list"] = string.join(stt_nodes, " ") # Wait this many seconds before doing anything, handy for letting disks get flushed too stt["power_timeout"] = "20" stt.commit() # Create a Dummy agent that always fails for levels-or if len(stf_nodes): ftype="fence_false" self.CM.install_helper(ftype, destdir="/usr/sbin") stf = Resource(self.Factory, "FencingFail", ftype, "stonith") stf["pcmk_host_list"] = string.join(stf_nodes, " ") # Wait this many seconds before doing anything, handy for letting disks get flushed too stf["power_timeout"] = "20" stf.commit() # Now commit the levels themselves stl.commit() o = Option(self.Factory, "stonith-enabled", self.CM.Env["DoFencing"]) o["start-failure-is-fatal"] = "false" o["pe-input-series-max"] = "5000" o["default-action-timeout"] = "90s" o["shutdown-escalation"] = "5min" o["batch-limit"] = "10" o["dc-deadtime"] = "5s" o["no-quorum-policy"] = no_quorum o["expected-quorum-votes"] = self.num_nodes if self.CM.Env["DoBSC"] == 1: o["ident-string"] = "Linux-HA TEST configuration file - REMOVEME!!" o.commit() # Add resources? 
if self.CM.Env["CIBResource"] == 1: self.add_resources() if self.CM.cluster_monitor == 1: mon = Resource(self.Factory, "cluster_mon", "ocf", "ClusterMon", "pacemaker") mon.add_op("start", "0", requires="nothing") mon.add_op("monitor", "5s", requires="nothing") mon["update"] = "10" mon["extra_options"] = "-r -n" mon["user"] = "abeekhof" mon["htmlfile"] = "/suse/abeekhof/Export/cluster.html" mon.commit() #self._create('''location prefer-dc cluster_mon rule -INFINITY: \#is_dc eq false''') # generate cib self.cts_cib = self._show() if self.Factory.tmpfile != CTSvars.CRM_CONFIG_DIR+"/cib.xml": self.Factory.rsh(self.Factory.target, "rm -f "+self.Factory.tmpfile) return self.cts_cib def add_resources(self): # Per-node resources for node in self.CM.Env["nodes"]: name = "rsc_"+node r = self.NewIP(name) r.prefer(node, "100") r.commit() # Migrator # Make this slightly sticky (since we have no other location constraints) to avoid relocation during Reattach m = Resource(self.Factory, "migrator","Dummy", "ocf", "pacemaker") m.add_meta("resource-stickiness","1") m.add_meta("allow-migrate", "1") m.add_op("monitor", "P10S") m.commit() # Ping the test master p = Resource(self.Factory, "ping-1","ping", "ocf", "pacemaker") p.add_op("monitor", "60s") p["host-list"] = self.CM.Env["cts-master"] p["name"] = "connected" p["debug"] = "true" c = Clone(self.Factory, "Connectivity", p) c["globally-unique"] = "false" c.commit() #master slave resource s = Resource(self.Factory, "stateful-1", "Stateful", "ocf", "pacemaker") s.add_op("monitor", "15s", timeout="60s") s.add_op("monitor", "16s", timeout="60s", role="Master") ms = Master(self.Factory, "master-1", s) ms["clone-max"] = self.num_nodes ms["master-max"] = 1 ms["clone-node-max"] = 1 ms["master-node-max"] = 1 # Require conectivity to run the master r = Rule(self.Factory, "connected", "-INFINITY", op="or") r.add_child(Expression(self.Factory, "m1-connected-1", "connected", "lt", "1")) r.add_child(Expression(self.Factory, "m1-connected-2", "connected", "not_defined", None)) ms.prefer("connected", rule=r) ms.commit() # Group Resource g = Group(self.Factory, "group-1") g.add_child(self.NewIP()) g.add_child(self.NewIP()) g.add_child(self.NewIP()) # Group with the master g.after("master-1", first="promote", then="start") g.colocate("master-1", "INFINITY", withrole="Master") g.commit() # LSB resource lsb_agent = self.CM.install_helper("LSBDummy") lsb = Resource(self.Factory, "lsb-dummy",lsb_agent, "lsb") lsb.add_op("monitor", "5s") # LSB with group lsb.after("group-1") lsb.colocate("group-1") lsb.commit() class CIB12(CIB11): feature_set = "3.0" version = "pacemaker-1.2" #class HASI(CIB10): # def add_resources(self): # # DLM resource # self._create('''primitive dlm ocf:pacemaker:controld op monitor interval=120s''') # self._create('''clone dlm-clone dlm meta globally-unique=false interleave=true''') # O2CB resource # self._create('''primitive o2cb ocf:ocfs2:o2cb op monitor interval=120s''') # self._create('''clone o2cb-clone o2cb meta globally-unique=false interleave=true''') # self._create('''colocation o2cb-with-dlm INFINITY: o2cb-clone dlm-clone''') # self._create('''order start-o2cb-after-dlm mandatory: dlm-clone o2cb-clone''') class ConfigFactory: def __init__(self, CM): self.CM = CM self.rsh = self.CM.rsh self.register("pacemaker11", CIB11, CM, self) self.register("pacemaker12", CIB12, CM, self) # self.register("hae", HASI, CM, self) self.target = self.CM.Env["nodes"][0] self.tmpfile = None def log(self, args): self.CM.log("cib: %s" % args) def debug(self, args): 
self.CM.debug("cib: %s" % args) def register(self, methodName, constructor, *args, **kargs): """register a constructor""" _args = [constructor] _args.extend(args) setattr(self, methodName, apply(ConfigFactoryItem,_args, kargs)) def unregister(self, methodName): """unregister a constructor""" delattr(self, methodName) def createConfig(self, name="pacemaker-1.0"): if name == "pacemaker-1.0": name = "pacemaker10"; elif name == "pacemaker-1.1": name = "pacemaker11"; elif name == "pacemaker-1.2": name = "pacemaker12"; elif name == "hasi": name = "hae"; if hasattr(self, name): return getattr(self, name)() else: self.CM.log("Configuration variant '%s' is unknown. Defaulting to latest config" % name) return self.pacemaker12() class ConfigFactoryItem: def __init__(self, function, *args, **kargs): assert callable(function), "function should be a callable obj" self._function = function self._args = args self._kargs = kargs def __call__(self, *args, **kargs): """call function""" _args = list(self._args) _args.extend(args) _kargs = self._kargs.copy() _kargs.update(kargs) return apply(self._function,_args,_kargs) # Basic Sanity Testing if __name__ == '__main__': import CTSlab env = CTSlab.LabEnvironment() env["nodes"] = [] env["nodes"].append("pcmk-1") env["nodes"].append("pcmk-2") env["nodes"].append("pcmk-3") env["nodes"].append("pcmk-4") env["CIBResource"] = 1 env["IPBase"] = "10.0.0.10" env["DoStonith"]=1 env["stonith-type"] = "fence_xvm" env["stonith-params"] = "pcmk_arg_map=domain:uname" manager = ClusterManager(env) manager.cluster_monitor = False CibFactory = ConfigFactory(manager) cib = CibFactory.createConfig("pacemaker-1.1") print cib.contents() diff --git a/cts/CTStests.py b/cts/CTStests.py index 060d722ce3..73b6b20ea1 100644 --- a/cts/CTStests.py +++ b/cts/CTStests.py @@ -1,2291 +1,2296 @@ '''CTS: Cluster Testing System: Tests module There are a few things we want to do here: ''' __copyright__=''' Copyright (C) 2000, 2001 Alan Robertson Licensed under the GNU GPL. Add RecourceRecover testcase Zhao Kai ''' # # This program is free software; you can redistribute it and/or # modify it under the terms of the GNU General Public License # as published by the Free Software Foundation; either version 2 # of the License, or (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program; if not, write to the Free Software # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. # # SPECIAL NOTE: # # Tests may NOT implement any cluster-manager-specific code in them. # EXTEND the ClusterManager object to provide the base capabilities # the test needs if you need to do something that the current CM classes # do not. Otherwise you screw up the whole point of the object structure # in CTS. # # Thank you. # import time, os, re, types, string, tempfile, sys from stat import * from cts import CTS from cts.CTSaudits import * AllTestClasses = [ ] class CTSTest: ''' A Cluster test. We implement the basic set of properties and behaviors for a generic cluster test. Cluster tests track their own statistics. We keep each of the kinds of counts we track as separate {name,value} pairs. 
''' def __init__(self, cm): #self.name="the unnamed test" self.Stats = {"calls":0 , "success":0 , "failure":0 , "skipped":0 , "auditfail":0} # if not issubclass(cm.__class__, ClusterManager): # raise ValueError("Must be a ClusterManager object") self.CM = cm self.Audits = [] self.timeout=120 self.passed = 1 self.is_loop = 0 self.is_unsafe = 0 self.is_experimental = 0 self.is_valgrind = 0 self.benchmark = 0 # which tests to benchmark self.timer = {} # timers def has_key(self, key): return self.Stats.has_key(key) def __setitem__(self, key, value): self.Stats[key] = value def __getitem__(self, key): return self.Stats[key] def log_mark(self, msg): self.CM.debug("MARK: test %s %s %d" % (self.name,msg,time.time())) return def get_timer(self,key = "test"): try: return self.timer[key] except: return 0 def set_timer(self,key = "test"): self.timer[key] = time.time() return self.timer[key] def log_timer(self,key = "test"): elapsed = 0 if key in self.timer: elapsed = time.time() - self.timer[key] s = key == "test" and self.name or "%s:%s" %(self.name,key) self.CM.debug("%s runtime: %.2f" % (s, elapsed)) del self.timer[key] return elapsed def incr(self, name): '''Increment (or initialize) the value associated with the given name''' if not self.Stats.has_key(name): self.Stats[name]=0 self.Stats[name] = self.Stats[name]+1 # Reset the test passed boolean if name == "calls": self.passed = 1 def failure(self, reason="none"): '''Increment the failure count''' self.passed = 0 self.incr("failure") self.CM.log(("Test %s" % self.name).ljust(35) +" FAILED: %s" % reason) return None def success(self): '''Increment the success count''' self.incr("success") return 1 def skipped(self): '''Increment the skipped count''' self.incr("skipped") return 1 def __call__(self, node): '''Perform the given test''' raise ValueError("Abstract Class member (__call__)") self.incr("calls") return self.failure() def audit(self): passed = 1 if len(self.Audits) > 0: for audit in self.Audits: if not audit(): self.CM.log("Internal %s Audit %s FAILED." 
% (self.name, audit.name())) self.incr("auditfail") passed = 0 return passed def setup(self, node): '''Setup the given test''' return self.success() def teardown(self, node): '''Tear down the given test''' return self.success() def create_watch(self, patterns, timeout, name=None): if not name: name = self.name return CTS.LogWatcher(self.CM.Env, self.CM["LogFileName"], patterns, name, timeout) def local_badnews(self, prefix, watch, local_ignore=[]): errcount = 0 if not prefix: prefix = "LocalBadNews:" ignorelist = [] ignorelist.append(" CTS: ") ignorelist.append(prefix) ignorelist.extend(local_ignore) while errcount < 100: match=watch.look(0) if match: add_err = 1 for ignore in ignorelist: if add_err == 1 and re.search(ignore, match): add_err = 0 if add_err == 1: self.CM.log(prefix + " " + match) errcount=errcount+1 else: break else: self.CM.log("Too many errors!") return errcount def is_applicable(self): return self.is_applicable_common() def is_applicable_common(self): '''Return TRUE if we are applicable in the current test configuration''' #raise ValueError("Abstract Class member (is_applicable)") if self.is_loop and not self.CM.Env["loop-tests"]: return 0 elif self.is_unsafe and not self.CM.Env["unsafe-tests"]: return 0 elif self.is_valgrind and not self.CM.Env["valgrind-tests"]: return 0 elif self.is_experimental and not self.CM.Env["experimental-tests"]: return 0 elif self.CM.Env["benchmark"] and self.benchmark == 0: return 0 return 1 def find_ocfs2_resources(self, node): self.r_o2cb = None self.r_ocfs2 = [] (rc, lines) = self.CM.rsh(node, "crm_resource -c", None) for line in lines: if re.search("^Resource", line): r = AuditResource(self.CM, line) if r.rtype == "o2cb" and r.parent != "NA": self.CM.debug("Found o2cb: %s" % self.r_o2cb) self.r_o2cb = r.parent if re.search("^Constraint", line): c = AuditConstraint(self.CM, line) if c.type == "rsc_colocation" and c.target == self.r_o2cb: self.r_ocfs2.append(c.rsc) self.CM.debug("Found ocfs2 filesystems: %s" % repr(self.r_ocfs2)) return len(self.r_ocfs2) def canrunnow(self, node): '''Return TRUE if we can meaningfully run right now''' return 1 def errorstoignore(self): '''Return list of errors which are 'normal' and should be ignored''' return [] ################################################################### class StopTest(CTSTest): ################################################################### '''Stop (deactivate) the cluster manager on a node''' def __init__(self, cm): CTSTest.__init__(self, cm) self.name="Stop" def __call__(self, node): '''Perform the 'stop' test. 
''' self.incr("calls") if self.CM.ShouldBeStatus[node] != "up": return self.skipped() patterns = [] # Technically we should always be able to notice ourselves stopping patterns.append(self.CM["Pat:We_stopped"] % node) #if self.CM.Env["use_logd"]: # patterns.append(self.CM["Pat:Logd_stopped"] % node) # Any active node needs to notice this one left # NOTE: This wont work if we have multiple partitions for other in self.CM.Env["nodes"]: if self.CM.ShouldBeStatus[other] == "up" and other != node: patterns.append(self.CM["Pat:They_stopped"] %(other, self.CM.key_for_node(node))) #self.debug("Checking %s will notice %s left"%(other, node)) watch = self.create_watch(patterns, self.CM["DeadTime"]) watch.setwatch() if node == self.CM.OurNode: self.incr("us") else: if self.CM.upcount() <= 1: self.incr("all") else: self.incr("them") self.CM.StopaCM(node) watch_result = watch.lookforall() failreason=None UnmatchedList = "||" if watch.unmatched: (rc, output) = self.CM.rsh(node, "/bin/ps axf", None) for line in output: self.CM.debug(line) (rc, output) = self.CM.rsh(node, "/usr/sbin/dlm_tool dump", None) for line in output: self.CM.debug(line) for regex in watch.unmatched: self.CM.log ("ERROR: Shutdown pattern not found: %s" % (regex)) UnmatchedList += regex + "||"; failreason="Missing shutdown pattern" self.CM.cluster_stable(self.CM["DeadTime"]) if not watch.unmatched or self.CM.upcount() == 0: return self.success() if len(watch.unmatched) >= self.CM.upcount(): return self.failure("no match against (%s)" % UnmatchedList) if failreason == None: return self.success() else: return self.failure(failreason) # # We don't register StopTest because it's better when called by # another test... # ################################################################### class StartTest(CTSTest): ################################################################### '''Start (activate) the cluster manager on a node''' def __init__(self, cm, debug=None): CTSTest.__init__(self,cm) self.name="start" self.debug = debug def __call__(self, node): '''Perform the 'start' test. ''' self.incr("calls") if self.CM.upcount() == 0: self.incr("us") else: self.incr("them") if self.CM.ShouldBeStatus[node] != "down": return self.skipped() elif self.CM.StartaCM(node): return self.success() else: return self.failure("Startup %s on node %s failed" %(self.CM["Name"], node)) # # We don't register StartTest because it's better when called by # another test... # ################################################################### class FlipTest(CTSTest): ################################################################### '''If it's running, stop it. If it's stopped start it. Overthrow the status quo... ''' def __init__(self, cm): CTSTest.__init__(self,cm) self.name="Flip" self.start = StartTest(cm) self.stop = StopTest(cm) def __call__(self, node): '''Perform the 'Flip' test. ''' self.incr("calls") if self.CM.ShouldBeStatus[node] == "up": self.incr("stopped") ret = self.stop(node) type="up->down" # Give the cluster time to recognize it's gone... 
time.sleep(self.CM["StableTime"]) elif self.CM.ShouldBeStatus[node] == "down": self.incr("started") ret = self.start(node) type="down->up" else: return self.skipped() self.incr(type) if ret: return self.success() else: return self.failure("%s failure" % type) # Register FlipTest as a good test to run AllTestClasses.append(FlipTest) ################################################################### class RestartTest(CTSTest): ################################################################### '''Stop and restart a node''' def __init__(self, cm): CTSTest.__init__(self,cm) self.name="Restart" self.start = StartTest(cm) self.stop = StopTest(cm) self.benchmark = 1 def __call__(self, node): '''Perform the 'restart' test. ''' self.incr("calls") self.incr("node:" + node) ret1 = 1 if self.CM.StataCM(node): self.incr("WasStopped") if not self.start(node): return self.failure("start (setup) failure: "+node) self.set_timer() if not self.stop(node): return self.failure("stop failure: "+node) if not self.start(node): return self.failure("start failure: "+node) return self.success() # Register RestartTest as a good test to run AllTestClasses.append(RestartTest) ################################################################### class StonithdTest(CTSTest): ################################################################### def __init__(self, cm): CTSTest.__init__(self, cm) self.name="Stonithd" self.startall = SimulStartLite(cm) self.benchmark = 1 def __call__(self, node): self.incr("calls") if len(self.CM.Env["nodes"]) < 2: return self.skipped() ret = self.startall(None) if not ret: return self.failure("Setup failed") is_dc = self.CM.is_node_dc(node) watchpats = [] watchpats.append("log_operation: Operation .* for host '%s' with device .* returned: 0" % node) watchpats.append("tengine_stonith_notify: Peer %s was terminated .*: OK" % node) if self.CM.Env["at-boot"] == 0: self.CM.debug("Expecting %s to stay down" % node) self.CM.ShouldBeStatus[node]="down" else: self.CM.debug("Expecting %s to come up again %d" % (node, self.CM.Env["at-boot"])) watchpats.append("%s .*do_state_transition: .* S_STARTING -> S_PENDING" % node) watchpats.append("%s .*do_state_transition: .* S_PENDING -> S_NOT_DC" % node) watch = self.create_watch(watchpats, 30 + self.CM["DeadTime"] + self.CM["StableTime"] + self.CM["StartTime"]) watch.setwatch() origin = self.CM.Env.RandomGen.choice(self.CM.Env["nodes"]) rc = self.CM.rsh(origin, "stonith_admin --reboot %s -VVVVVV" % node) if rc == 194: # 194 - 256 = -62 = Timer expired # # Look for the patterns, usually this means the required # device was running on the node to be fenced - or that # the required devices were in the process of being loaded # and/or moved # # Effectively the node committed suicide so there will be # no confirmation, but pacemaker should be watching and # fence the node again self.CM.log("Fencing command on %s to fence %s timed out" % (origin, node)) elif origin != node and rc != 0: self.CM.debug("Waiting for the cluster to recover") self.CM.cluster_stable() self.CM.debug("Waiting for the STONITHd node to come back up") self.CM.ns.WaitForAllNodesToComeUp(self.CM.Env["nodes"], 600) self.CM.log("Fencing command on %s failed to fence %s (rc=%d)" % (origin, node, rc)) elif origin == node and rc != 255: # 255 == broken pipe, i.e.
the node was fenced as expected self.CM.log("Locally originated fencing returned %d" % rc) self.set_timer("fence") matched = watch.lookforall() self.log_timer("fence") self.set_timer("reform") if watch.unmatched: self.CM.log("Patterns not found: " + repr(watch.unmatched)) self.CM.debug("Waiting for the cluster to recover") self.CM.cluster_stable() self.CM.debug("Waiting for the STONITHd node to come back up") self.CM.ns.WaitForAllNodesToComeUp(self.CM.Env["nodes"], 600) self.CM.debug("Waiting for the cluster to re-stabilize with all nodes") is_stable = self.CM.cluster_stable(self.CM["StartTime"]) if not matched: return self.failure("Didn't find all expected patterns") elif not is_stable: return self.failure("Cluster did not become stable") self.log_timer("reform") return self.success() def errorstoignore(self): return [ self.CM["Pat:We_fenced"] % ".*", self.CM["Pat:They_fenced"] % ".*", "error: native_create_actions: Resource .*stonith::.* is active on 2 nodes attempting recovery", "error: remote_op_done: Operation reboot of .*by .* for stonith_admin.*: Timer expired", ] def is_applicable(self): if not self.is_applicable_common(): return 0 if self.CM.Env.has_key("DoFencing"): return self.CM.Env["DoFencing"] return 1 AllTestClasses.append(StonithdTest) ################################################################### class StartOnebyOne(CTSTest): ################################################################### '''Start all the nodes ~ one by one''' def __init__(self, cm): CTSTest.__init__(self,cm) self.name="StartOnebyOne" self.stopall = SimulStopLite(cm) self.start = StartTest(cm) self.ns=CTS.NodeStatus(cm.Env) def __call__(self, dummy): '''Perform the 'StartOnebyOne' test. ''' self.incr("calls") # We ignore the "node" parameter... # Shut down all the nodes... ret = self.stopall(None) if not ret: return self.failure("Test setup failed") failed=[] self.set_timer() for node in self.CM.Env["nodes"]: if not self.start(node): failed.append(node) if len(failed) > 0: return self.failure("Some nodes failed to start: " + repr(failed)) return self.success() # Register StartOnebyOne as a good test to run AllTestClasses.append(StartOnebyOne) ################################################################### class SimulStart(CTSTest): ################################################################### '''Start all the nodes ~ simultaneously''' def __init__(self, cm): CTSTest.__init__(self,cm) self.name="SimulStart" self.stopall = SimulStopLite(cm) self.startall = SimulStartLite(cm) def __call__(self, dummy): '''Perform the 'SimulStart' test. ''' self.incr("calls") # We ignore the "node" parameter... # Shut down all the nodes... ret = self.stopall(None) if not ret: return self.failure("Setup failed") self.CM.clear_all_caches() if not self.startall(None): return self.failure("Startall failed") return self.success() # Register SimulStart as a good test to run AllTestClasses.append(SimulStart) ################################################################### class SimulStop(CTSTest): ################################################################### '''Stop all the nodes ~ simultaneously''' def __init__(self, cm): CTSTest.__init__(self,cm) self.name="SimulStop" self.startall = SimulStartLite(cm) self.stopall = SimulStopLite(cm) def __call__(self, dummy): '''Perform the 'SimulStop' test. ''' self.incr("calls") # We ignore the "node" parameter... # Start up all the nodes...
ret = self.startall(None) if not ret: return self.failure("Setup failed") if not self.stopall(None): return self.failure("Stopall failed") return self.success() # Register SimulStop as a good test to run AllTestClasses.append(SimulStop) ################################################################### class StopOnebyOne(CTSTest): ################################################################### '''Stop all the nodes in order''' def __init__(self, cm): CTSTest.__init__(self,cm) self.name="StopOnebyOne" self.startall = SimulStartLite(cm) self.stop = StopTest(cm) def __call__(self, dummy): '''Perform the 'StopOnebyOne' test. ''' self.incr("calls") # We ignore the "node" parameter... # Start up all the nodes... ret = self.startall(None) if not ret: return self.failure("Setup failed") failed=[] self.set_timer() for node in self.CM.Env["nodes"]: if not self.stop(node): failed.append(node) if len(failed) > 0: return self.failure("Some nodes failed to stop: " + repr(failed)) self.CM.clear_all_caches() return self.success() # Register StopOnebyOne as a good test to run AllTestClasses.append(StopOnebyOne) ################################################################### class RestartOnebyOne(CTSTest): ################################################################### '''Restart all the nodes in order''' def __init__(self, cm): CTSTest.__init__(self,cm) self.name="RestartOnebyOne" self.startall = SimulStartLite(cm) def __call__(self, dummy): '''Perform the 'RestartOnebyOne' test. ''' self.incr("calls") # We ignore the "node" parameter... # Start up all the nodes... ret = self.startall(None) if not ret: return self.failure("Setup failed") did_fail=[] self.set_timer() self.restart = RestartTest(self.CM) for node in self.CM.Env["nodes"]: if not self.restart(node): did_fail.append(node) if did_fail: return self.failure("Could not restart %d nodes: %s" %(len(did_fail), repr(did_fail))) return self.success() # Register RestartOnebyOne as a good test to run AllTestClasses.append(RestartOnebyOne) ################################################################### class PartialStart(CTSTest): ################################################################### '''Start a node - but tell it to stop before it finishes starting up''' def __init__(self, cm): CTSTest.__init__(self,cm) self.name="PartialStart" self.startall = SimulStartLite(cm) self.stopall = SimulStopLite(cm) self.stop = StopTest(cm) #self.is_unsafe = 1 def __call__(self, node): '''Perform the 'PartialStart' test. ''' self.incr("calls") ret = self.stopall(None) if not ret: return self.failure("Setup failed") # FIXME!
This should use the CM class to get the pattern # then it would be applicable in general watchpats = [] watchpats.append("crmd.*Connecting to cluster infrastructure") watch = self.create_watch(watchpats, self.CM["DeadTime"]+10) watch.setwatch() self.CM.StartaCMnoBlock(node) ret = watch.lookforall() if not ret: self.CM.log("Patterns not found: " + repr(watch.unmatched)) return self.failure("Setup of %s failed" % node) ret = self.stop(node) if not ret: return self.failure("%s did not stop in time" % node) return self.success() def errorstoignore(self): '''Return list of errors which should be ignored''' # We might do some fencing in the 2-node case if we make it up far enough return [ """Executing reboot fencing operation""" ] # Register PartialStart as a good test to run AllTestClasses.append(PartialStart) ####################################################################### class StandbyTest(CTSTest): ####################################################################### def __init__(self, cm): CTSTest.__init__(self,cm) self.name="Standby" self.benchmark = 1 self.start = StartTest(cm) self.startall = SimulStartLite(cm) # make sure the node is active # set the node to standby mode # check resources, no resources should be running on the node # set the node to active mode # check resources, resources should have been migrated back (SHOULD THEY?) def __call__(self, node): self.incr("calls") ret=self.startall(None) if not ret: return self.failure("Start all nodes failed") self.CM.debug("Make sure node %s is active" % node) if self.CM.StandbyStatus(node) != "off": if not self.CM.SetStandbyMode(node, "off"): return self.failure("can't set node %s to active mode" % node) self.CM.cluster_stable() status = self.CM.StandbyStatus(node) if status != "off": return self.failure("standby status of %s is [%s] but we expect [off]" % (node, status)) self.CM.debug("Getting resources running on node %s" % node) rsc_on_node = self.CM.active_resources(node) watchpats = [] watchpats.append("do_state_transition:.*-> S_POLICY_ENGINE") watch = self.create_watch(watchpats, self.CM["DeadTime"]+10) watch.setwatch() self.CM.debug("Setting node %s to standby mode" % node) if not self.CM.SetStandbyMode(node, "on"): return self.failure("can't set node %s to standby mode" % node) self.set_timer("on") ret = watch.lookforall() if not ret: self.CM.log("Patterns not found: " + repr(watch.unmatched)) self.CM.SetStandbyMode(node, "off") return self.failure("cluster didn't react to standby change on %s" % node) self.CM.cluster_stable() status = self.CM.StandbyStatus(node) if status != "on": return self.failure("standby status of %s is [%s] but we expect [on]" % (node, status)) self.log_timer("on") self.CM.debug("Checking resources") bad_run = self.CM.active_resources(node) if len(bad_run) > 0: rc = self.failure("%s set to standby, %s is still running on it" % (node, repr(bad_run))) self.CM.debug("Setting node %s to active mode" % node) self.CM.SetStandbyMode(node, "off") return rc self.CM.debug("Setting node %s to active mode" % node) if not self.CM.SetStandbyMode(node, "off"): return self.failure("can't set node %s to active mode" % node) self.set_timer("off") self.CM.cluster_stable() status = self.CM.StandbyStatus(node) if status != "off": return self.failure("standby status of %s is [%s] but we expect [off]" % (node, status)) self.log_timer("off") return self.success() AllTestClasses.append(StandbyTest) ####################################################################### class ValgrindTest(CTSTest):
####################################################################### '''Check for memory leaks''' def __init__(self, cm): CTSTest.__init__(self,cm) self.name="Valgrind" self.stopall = SimulStopLite(cm) self.startall = SimulStartLite(cm) self.is_valgrind = 1 self.is_loop = 1 def setup(self, node): self.incr("calls") ret=self.stopall(None) if not ret: return self.failure("Stop all nodes failed") # Enable valgrind self.logPat = "/tmp/%s-*.valgrind" % self.name self.CM.Env["valgrind-prefix"] = self.name self.CM.rsh(node, "rm -f %s" % self.logPat, None) ret=self.startall(None) if not ret: return self.failure("Start all nodes failed") for node in self.CM.Env["nodes"]: (rc, output) = self.CM.rsh(node, "ps u --ppid `pidofproc aisexec`", None) for line in output: self.CM.debug(line) return self.success() def teardown(self, node): # Disable valgrind self.CM.Env["valgrind-prefix"] = None # Return all nodes to normal ret=self.stopall(None) if not ret: return self.failure("Stop all nodes failed") return self.success() def find_leaks(self): # Check for leaks leaked = [] self.stop = StopTest(self.CM) for node in self.CM.Env["nodes"]: (rc, ps_out) = self.CM.rsh(node, "ps u --ppid `pidofproc aisexec`", None) rc = self.stop(node) if not rc: self.failure("Couldn't shut down %s" % node) rc = self.CM.rsh(node, "grep -e indirectly.*lost:.*[1-9] -e definitely.*lost:.*[1-9] -e (ERROR|error).*SUMMARY:.*[1-9].*errors %s" % self.logPat, 0) if rc != 1: leaked.append(node) self.failure("Valgrind errors detected on %s" % node) for line in ps_out: self.CM.log(line) (rc, output) = self.CM.rsh(node, "grep -e lost: -e SUMMARY: %s" % self.logPat, None) for line in output: self.CM.log(line) (rc, output) = self.CM.rsh(node, "cat %s" % self.logPat, None) for line in output: self.CM.debug(line) self.CM.rsh(node, "rm -f %s" % self.logPat, None) return leaked def __call__(self, node): leaked = self.find_leaks() if len(leaked) > 0: return self.failure("Nodes %s leaked" % repr(leaked)) return self.success() def errorstoignore(self): '''Return list of errors which should be ignored''' return [ """cib:.*readCibXmlFile:""", """HA_VALGRIND_ENABLED""" ] ####################################################################### class StandbyLoopTest(ValgrindTest): ####################################################################### '''Check for memory leaks by putting a node in and out of standby for an hour''' def __init__(self, cm): ValgrindTest.__init__(self,cm) self.name="StandbyLoop" def __call__(self, node): lpc = 0 delay = 2 failed = 0 done=time.time() + self.CM.Env["loop-minutes"]*60 while time.time() <= done and not failed: lpc = lpc + 1 time.sleep(delay) if not self.CM.SetStandbyMode(node, "on"): self.failure("can't set node %s to standby mode" % node) failed = lpc time.sleep(delay) if not self.CM.SetStandbyMode(node, "off"): self.failure("can't set node %s to active mode" % node) failed = lpc leaked = self.find_leaks() if failed: return self.failure("Iteration %d failed" % failed) elif len(leaked) > 0: return self.failure("Nodes %s leaked" % repr(leaked)) return self.success() AllTestClasses.append(StandbyLoopTest) ############################################################################## class BandwidthTest(CTSTest): ############################################################################## # Tests should not be cluster-manager-specific # If you need to find out cluster manager configuration to do this, then # it should be added to the generic cluster manager API. 
'''Test the bandwidth which heartbeat uses''' def __init__(self, cm): CTSTest.__init__(self, cm) self.name = "Bandwidth" self.start = StartTest(cm) self.__setitem__("min",0) self.__setitem__("max",0) self.__setitem__("totalbandwidth",0) self.tempfile = tempfile.mktemp(".cts") self.startall = SimulStartLite(cm) def __call__(self, node): '''Perform the Bandwidth test''' self.incr("calls") if self.CM.upcount()<1: return self.skipped() Path = self.CM.InternalCommConfig() if "ip" not in Path["mediatype"]: return self.skipped() port = Path["port"][0] port = int(port) ret = self.startall(None) if not ret: return self.failure("Test setup failed") time.sleep(5) # We get extra messages right after startup. fstmpfile = "/var/run/band_estimate" dumpcmd = "tcpdump -p -n -c 102 -i any udp port %d > %s 2>&1" \ % (port, fstmpfile) rc = self.CM.rsh(node, dumpcmd) if rc == 0: farfile = "root@%s:%s" % (node, fstmpfile) self.CM.rsh.cp(farfile, self.tempfile) Bandwidth = self.countbandwidth(self.tempfile) if not Bandwidth: self.CM.log("Could not compute bandwidth.") return self.success() intband = int(Bandwidth + 0.5) self.CM.log("...bandwidth: %d bits/sec" % intband) self.Stats["totalbandwidth"] = self.Stats["totalbandwidth"] + Bandwidth if self.Stats["min"] == 0: self.Stats["min"] = Bandwidth if Bandwidth > self.Stats["max"]: self.Stats["max"] = Bandwidth if Bandwidth < self.Stats["min"]: self.Stats["min"] = Bandwidth self.CM.rsh(node, "rm -f %s" % fstmpfile) os.unlink(self.tempfile) return self.success() else: return self.failure("no response from tcpdump command [%d]!" % rc) def countbandwidth(self, file): fp = open(file, "r") fp.seek(0) count = 0 sum = 0 while 1: line = fp.readline() if not line: return None if re.search("udp",line) or re.search("UDP,", line): count=count+1 linesplit = string.split(line," ") for j in range(len(linesplit)-1): if linesplit[j]=="udp": break if linesplit[j]=="length:": break try: sum = sum + int(linesplit[j+1]) except ValueError: self.CM.log("Invalid tcpdump line: %s" % line) return None T1 = linesplit[0] timesplit = string.split(T1,":") time2split = string.split(timesplit[2],".") time1 = (long(timesplit[0])*60+long(timesplit[1]))*60+long(time2split[0])+long(time2split[1])*0.000001 break while count < 100: line = fp.readline() if not line: return None if re.search("udp",line) or re.search("UDP,", line): count = count+1 linessplit = string.split(line," ") for j in range(len(linessplit)-1): if linessplit[j] =="udp": break if linessplit[j]=="length:": break try: sum=int(linessplit[j+1])+sum except ValueError: self.CM.log("Invalid tcpdump line: %s" % line) return None T2 = linessplit[0] timesplit = string.split(T2,":") time2split = string.split(timesplit[2],".") time2 = (long(timesplit[0])*60+long(timesplit[1]))*60+long(time2split[0])+long(time2split[1])*0.000001 time = time2-time1 if (time <= 0): return 0 return (sum*8)/time def is_applicable(self): '''BandwidthTest never applicable''' return 0 AllTestClasses.append(BandwidthTest) ################################################################### class ResourceRecover(CTSTest): ################################################################### def __init__(self, cm): CTSTest.__init__(self,cm) self.name="ResourceRecover" self.start = StartTest(cm) self.startall = SimulStartLite(cm) self.max=30 self.rid=None self.rid_alt=None #self.is_unsafe = 1 self.benchmark = 1 # these are the values used for the new LRM API call self.action = "asyncmon" self.interval = 0 def __call__(self, node): '''Perform the 'ResourceRecover' test.
''' self.incr("calls") ret = self.startall(None) if not ret: return self.failure("Setup failed") resourcelist = self.CM.active_resources(node) # if the resource list is empty, return directly if len(resourcelist)==0: self.CM.log("No active resources on %s" % node) return self.skipped() self.rid = self.CM.Env.RandomGen.choice(resourcelist) self.rid_alt = self.rid rsc = None (rc, lines) = self.CM.rsh(node, "crm_resource -c", None) for line in lines: if re.search("^Resource", line): tmp = AuditResource(self.CM, line) if tmp.id == self.rid: rsc = tmp # Handle anonymous clones that get renamed self.rid = rsc.clone_id break if not rsc: return self.failure("Could not find %s in the resource list" % self.rid) self.CM.debug("Shooting %s aka. %s" % (rsc.clone_id, rsc.id)) pats = [] pats.append("Updating failcount for %s on .* after .* %s" % (self.rid, self.action)) if rsc.managed(): pats.append("process_lrm_event: LRM operation %s_stop_0.*confirmed.*ok" % self.rid) if rsc.unique(): pats.append("process_lrm_event: LRM operation %s_start_0.*confirmed.*ok" % self.rid) else: # Anonymous clones may get restarted with a different clone number pats.append("process_lrm_event: LRM operation .*_start_0.*confirmed.*ok") watch = self.create_watch(pats, 60) watch.setwatch() self.CM.rsh(node, "crm_resource -V -F -r %s -H %s &>/dev/null" % (self.rid, node)) self.set_timer("recover") watch.lookforall() self.log_timer("recover") self.CM.cluster_stable() recovered=self.CM.ResourceLocation(self.rid) if watch.unmatched: return self.failure("Patterns not found: %s" % repr(watch.unmatched)) elif rsc.unique() and len(recovered) > 1: return self.failure("%s is now active on more than one node: %s"%(self.rid, repr(recovered))) elif len(recovered) > 0: self.CM.debug("%s is running on: %s" %(self.rid, repr(recovered))) elif rsc.managed(): return self.failure("%s was not recovered and is inactive" % self.rid) return self.success() def errorstoignore(self): '''Return list of errors which should be ignored''' return [ """Updating failcount for %s""" % self.rid, """LogActions: Recover %s""" % self.rid, """LogActions: Recover %s""" % self.rid_alt, """Unknown operation: fail""", """(ERROR|error): sending stonithRA op to stonithd failed.""", """(ERROR|error): process_lrm_event: LRM operation %s_%s_%d""" % (self.rid, self.action, self.interval), """(ERROR|error): process_graph_event: Action %s_%s_%d .* initiated outside of a transition""" % (self.rid, self.action, self.interval), ] AllTestClasses.append(ResourceRecover) ################################################################### class ComponentFail(CTSTest): ################################################################### def __init__(self, cm): CTSTest.__init__(self,cm) self.name="ComponentFail" self.startall = SimulStartLite(cm) self.complist = cm.Components() self.patterns = [] self.okerrpatterns = [] self.is_unsafe = 1 def __call__(self, node): '''Perform the 'ComponentFail' test.
''' self.incr("calls") self.patterns = [] self.okerrpatterns = [] # start all nodes ret = self.startall(None) if not ret: return self.failure("Setup failed") if not self.CM.cluster_stable(self.CM["StableTime"]): return self.failure("Setup failed - unstable") node_is_dc = self.CM.is_node_dc(node, None) # select a component to kill chosen = self.CM.Env.RandomGen.choice(self.complist) while chosen.dc_only == 1 and node_is_dc == 0: chosen = self.CM.Env.RandomGen.choice(self.complist) self.CM.debug("...component %s (dc=%d,boot=%d)" % (chosen.name, node_is_dc,chosen.triggersreboot)) self.incr(chosen.name) if chosen.name != "aisexec": if self.CM["Name"] != "crm-lha" or chosen.name != "pengine": self.patterns.append(self.CM["Pat:ChildKilled"] %(node, chosen.name)) self.patterns.append(self.CM["Pat:ChildRespawn"] %(node, chosen.name)) self.patterns.extend(chosen.pats) if node_is_dc: self.patterns.extend(chosen.dc_pats) # In an ideal world, this next stuff should be in the "chosen" object as a member function if self.CM["Name"] == "crm-lha" and chosen.triggersreboot: # Make sure the node goes down and then comes back up if it should reboot... for other in self.CM.Env["nodes"]: if other != node: self.patterns.append(self.CM["Pat:They_stopped"] %(other, self.CM.key_for_node(node))) self.patterns.append(self.CM["Pat:Slave_started"] % node) self.patterns.append(self.CM["Pat:Local_started"] % node) if chosen.dc_only: # Sometimes these will be in the log, and sometimes they won't... self.okerrpatterns.append("%s .*Process %s:.* exited" %(node, chosen.name)) self.okerrpatterns.append("%s .*I_ERROR.*crmdManagedChildDied" %node) self.okerrpatterns.append("%s .*The %s subsystem terminated unexpectedly" %(node, chosen.name)) self.okerrpatterns.append("(ERROR|error): Client .* exited with return code") else: # Sometimes this won't be in the log... 
self.okerrpatterns.append(self.CM["Pat:ChildKilled"] %(node, chosen.name)) self.okerrpatterns.append(self.CM["Pat:ChildRespawn"] %(node, chosen.name)) self.okerrpatterns.append(self.CM["Pat:ChildExit"]) # supply a copy so self.patterns doesn't end up empty tmpPats = [] tmpPats.extend(self.patterns) self.patterns.extend(chosen.badnews_ignore) # Look for STONITH ops, depending on Env["at-boot"] we might need to change the node's status stonithPats = [] stonithPats.append(self.CM["Pat:They_fenced"] % node) stonith = self.create_watch(stonithPats, 0) stonith.setwatch() # set the watch for stable watch = self.create_watch( tmpPats, self.CM["DeadTime"] + self.CM["StableTime"] + self.CM["StartTime"]) watch.setwatch() # kill the component chosen.kill(node) self.CM.debug("Waiting for the cluster to recover") self.CM.cluster_stable() self.CM.debug("Waiting for any STONITHd node to come back up") self.CM.ns.WaitForAllNodesToComeUp(self.CM.Env["nodes"], 600) self.CM.debug("Waiting for the cluster to re-stabilize with all nodes") self.CM.cluster_stable(self.CM["StartTime"]) self.CM.debug("Checking if %s was shot" % node) shot = stonith.look(60) if shot: self.CM.debug("Found: "+ repr(shot)) self.okerrpatterns.append(self.CM["Pat:We_fenced"] % node) if self.CM.Env["at-boot"] == 0: self.CM.ShouldBeStatus[node]="down" # If fencing occurred, chances are many (if not all) the expected logs # will not be sent - or will be lost when the node reboots return self.success() # check for logs indicating a graceful recovery matched = watch.lookforall(allow_multiple_matches=1) if watch.unmatched: self.CM.log("Patterns not found: " + repr(watch.unmatched)) self.CM.debug("Waiting for the cluster to re-stabilize with all nodes") is_stable = self.CM.cluster_stable(self.CM["StartTime"]) if not matched: return self.failure("Didn't find all expected patterns") elif not is_stable: return self.failure("Cluster did not become stable") return self.success() def errorstoignore(self): '''Return list of errors which should be ignored''' # Note that okerrpatterns refers to the last time we ran this test # The good news is that this works fine for us... self.okerrpatterns.extend(self.patterns) return self.okerrpatterns AllTestClasses.append(ComponentFail) #################################################################### class SplitBrainTest(CTSTest): #################################################################### '''It is used to test split-brain:
when the path between the two nodes breaks, check whether both nodes take over the resource''' def __init__(self,cm): CTSTest.__init__(self,cm) self.name = "SplitBrain" self.start = StartTest(cm) self.startall = SimulStartLite(cm) self.is_experimental = 1 def isolate_partition(self, partition): other_nodes = [] other_nodes.extend(self.CM.Env["nodes"]) for node in partition: try: other_nodes.remove(node) except ValueError: self.CM.log("Node "+node+" not in " + repr(self.CM.Env["nodes"]) + " from " +repr(partition)) if len(other_nodes) == 0: return 1 self.CM.debug("Creating partition: " + repr(partition)) self.CM.debug("Everyone else: " + repr(other_nodes)) for node in partition: if not self.CM.isolate_node(node, other_nodes): self.CM.log("Could not isolate %s" % node) return 0 return 1 def heal_partition(self, partition): other_nodes = [] other_nodes.extend(self.CM.Env["nodes"]) for node in partition: try: other_nodes.remove(node) except ValueError: self.CM.log("Node "+node+" not in " + repr(self.CM.Env["nodes"])) if len(other_nodes) == 0: return 1 self.CM.debug("Healing partition: " + repr(partition)) self.CM.debug("Everyone else: " + repr(other_nodes)) for node in partition: self.CM.unisolate_node(node, other_nodes) def __call__(self, node): '''Perform split-brain test''' self.incr("calls") self.passed = 1 partitions = {} ret = self.startall(None) if not ret: return self.failure("Setup failed") while 1: # Retry until we get multiple partitions partitions = {} p_max = len(self.CM.Env["nodes"]) for node in self.CM.Env["nodes"]: p = self.CM.Env.RandomGen.randint(1, p_max) if not partitions.has_key(p): partitions[p]= [] partitions[p].append(node) p_max = len(partitions.keys()) if p_max > 1: break # else, try again self.CM.debug("Created %d partitions" % p_max) for key in partitions.keys(): self.CM.debug("Partition["+str(key)+"]:\t"+repr(partitions[key])) # Disabling STONITH to reduce test complexity for now self.CM.rsh(node, "crm_attribute -V -n stonith-enabled -v false") for key in partitions.keys(): self.isolate_partition(partitions[key]) count = 30 while count > 0: if len(self.CM.find_partitions()) != p_max: time.sleep(10) count -= 1 else: break else: self.failure("Expected partitions were not created") # Target number of partitions formed - wait for stability if not self.CM.cluster_stable(): self.failure("Partitioned cluster not stable") # Now audit the cluster state self.CM.partitions_expected = p_max if not self.audit(): self.failure("Audits failed") self.CM.partitions_expected = 1 # And heal them again for key in partitions.keys(): self.heal_partition(partitions[key]) # Wait for a single partition to form count = 30 while count > 0: if len(self.CM.find_partitions()) != 1: time.sleep(10) count -= 1 else: break else: self.failure("Cluster did not reform") # Wait for it to have the right number of members count = 30 while count > 0: members = [] partitions = self.CM.find_partitions() if len(partitions) > 0: members = partitions[0].split() if len(members) != len(self.CM.Env["nodes"]): time.sleep(10) count -= 1 else: break else: self.failure("Cluster did not completely reform") # Wait up to 20 minutes - the delay is preferable to # trying to continue in a messed-up state if not self.CM.cluster_stable(1200): self.failure("Reformed cluster not stable") answer = raw_input('Continue?
[nY]') if answer and answer == "n": raise ValueError("Reformed cluster not stable") # Turn fencing back on if self.CM.Env["DoFencing"]: self.CM.rsh(node, "crm_attribute -V -D -n stonith-enabled") self.CM.cluster_stable() if self.passed: return self.success() return self.failure("See previous errors") def errorstoignore(self): '''Return list of errors which are 'normal' and should be ignored''' return [ "Another DC detected:", "(ERROR|error): attrd_cib_callback: .*Application of an update diff failed", "crmd_ha_msg_callback:.*not in our membership list", "CRIT:.*node.*returning after partition", ] def is_applicable(self): if not self.is_applicable_common(): return 0 return len(self.CM.Env["nodes"]) > 2 AllTestClasses.append(SplitBrainTest) #################################################################### class Reattach(CTSTest): #################################################################### def __init__(self, cm): CTSTest.__init__(self,cm) self.name="Reattach" self.startall = SimulStartLite(cm) self.restart1 = RestartTest(cm) self.stopall = SimulStopLite(cm) self.is_unsafe = 0 # Handled by canrunnow() def setup(self, node): attempt=0 if not self.startall(None): return None # Make sure we are really _really_ stable and that all # resources, including those that depend on transient node # attributes, are started while not self.CM.cluster_stable(double_check=True): if attempt < 5: attempt += 1 self.CM.debug("Not stable yet, re-testing") else: self.CM.log("Cluster is not stable") return None return 1 def teardown(self, node): # Make sure 'node' is up start = StartTest(self.CM) start(node) is_managed = self.CM.rsh(node, "crm_attribute -Q -G -t crm_config -n is-managed-default -d true", 1) is_managed = is_managed[:-1] # Strip off the newline if is_managed != "true": self.CM.log("Attempting to re-enable resource management on %s (%s)" % (node, is_managed)) managed = self.create_watch(["is-managed-default"], 60) managed.setwatch() self.CM.rsh(node, "crm_attribute -V -D -n is-managed-default") if not managed.lookforall(): self.CM.log("Patterns not found: " + repr(managed.unmatched)) self.CM.log("Could not re-enable resource management") return 0 return 1 def canrunnow(self, node): '''Return TRUE if we can meaningfully run right now''' if self.find_ocfs2_resources(node): self.CM.log("Detach/Reattach scenarios are not possible with OCFS2 services present") return 0 return 1 def __call__(self, node): self.incr("calls") pats = [] managed = self.create_watch(["is-managed-default"], 60) managed.setwatch() self.CM.debug("Disable resource management") self.CM.rsh(node, "crm_attribute -V -n is-managed-default -v false") if not managed.lookforall(): self.CM.log("Patterns not found: " + repr(managed.unmatched)) return self.failure("Resource management not disabled") pats = [] pats.append("process_lrm_event: .*_stop") pats.append("process_lrm_event: .*_start") pats.append("process_lrm_event: .*_promote") pats.append("process_lrm_event: .*_demote") pats.append("process_lrm_event: .*_migrate") watch = self.create_watch(pats, 60, "ShutdownActivity") watch.setwatch() self.CM.debug("Shutting down the cluster") ret = self.stopall(None) if not ret: self.CM.debug("Re-enable resource management") self.CM.rsh(node, "crm_attribute -V -D -n is-managed-default") return self.failure("Couldn't shut down the cluster") self.CM.debug("Bringing the cluster back up") ret = self.startall(None) time.sleep(5) # allow ping to update the CIB if not ret: self.CM.debug("Re-enable resource management") self.CM.rsh(node, 
"crm_attribute -V -D -n is-managed-default") return self.failure("Couldn't restart the cluster") if self.local_badnews("ResourceActivity:", watch): self.CM.debug("Re-enable resource management") self.CM.rsh(node, "crm_attribute -V -D -n is-managed-default") return self.failure("Resources stopped or started during cluster restart") watch = self.create_watch(pats, 60, "StartupActivity") watch.setwatch() managed = self.create_watch(["is-managed-default"], 60) managed.setwatch() self.CM.debug("Re-enable resource management") self.CM.rsh(node, "crm_attribute -V -D -n is-managed-default") if not managed.lookforall(): self.CM.log("Patterns not found: " + repr(managed.unmatched)) return self.failure("Resource management not enabled") self.CM.cluster_stable() # Ignore actions for STONITH resources ignore = [] (rc, lines) = self.CM.rsh(node, "crm_resource -c", None) for line in lines: if re.search("^Resource", line): r = AuditResource(self.CM, line) if r.rclass == "stonith": self.CM.debug("Ignoring start actions for %s" % r.id) ignore.append("process_lrm_event: LRM operation %s_start_0.*confirmed.*ok" % r.id) if self.local_badnews("ResourceActivity:", watch, ignore): return self.failure("Resources stopped or started after resource management was re-enabled") return ret def errorstoignore(self): '''Return list of errors which should be ignored''' return [ "You may ignore this error if it is unmanaged.", "pingd: .*(ERROR|error): send_ipc_message:", "pingd: .*(ERROR|error): send_update:", "lrmd: .*(ERROR|error): notify_client:", ] def is_applicable(self): if self.CM["Name"] == "crm-lha": return None return 1 AllTestClasses.append(Reattach) #################################################################### class SpecialTest1(CTSTest): #################################################################### '''Set up a custom test to cause quorum failure issues for Andrew''' def __init__(self, cm): CTSTest.__init__(self,cm) self.name="SpecialTest1" self.startall = SimulStartLite(cm) self.restart1 = RestartTest(cm) self.stopall = SimulStopLite(cm) def __call__(self, node): '''Perform the 'SpecialTest1' test for Andrew. ''' self.incr("calls") # Shut down all the nodes... 
ret = self.stopall(None) if not ret: return self.failure("Could not stop all nodes") # Start the selected node ret = self.restart1(node) if not ret: return self.failure("Could not start "+node) # Start all remaining nodes ret = self.startall(None) if not ret: return self.failure("Could not start the remaining nodes") return self.success() AllTestClasses.append(SpecialTest1) #################################################################### class HAETest(CTSTest): #################################################################### '''Base class for tests of the HA Extension (HAE) services''' def __init__(self, cm): CTSTest.__init__(self,cm) self.name="HAETest" self.stopall = SimulStopLite(cm) self.startall = SimulStartLite(cm) self.is_loop = 1 def setup(self, node): # Start all remaining nodes ret = self.startall(None) if not ret: return self.failure("Couldn't start all nodes") return self.success() def teardown(self, node): # Stop everything ret = self.stopall(None) if not ret: return self.failure("Couldn't stop all nodes") return self.success() def wait_on_state(self, node, resource, expected_clones, attempts=240): while attempts > 0: active=0 (rc, lines) = self.CM.rsh(node, "crm_resource -r %s -W -Q" % resource, stdout=None) # Hack until crm_resource does the right thing if rc == 0 and lines: active = len(lines) if len(lines) == expected_clones: return 1 elif rc == 1: self.CM.debug("Resource %s is still inactive" % resource) elif rc == 234: self.CM.log("Unknown resource %s" % resource) return 0 elif rc == 246: self.CM.log("Cluster is inactive") return 0 elif rc != 0: self.CM.log("Call to crm_resource failed, rc=%d" % rc) return 0 else: self.CM.debug("Resource %s is active %d times instead of %d" % (resource, active, expected_clones)) attempts -= 1 time.sleep(1) return 0 def find_dlm(self, node): self.r_dlm = None (rc, lines) = self.CM.rsh(node, "crm_resource -c", None) for line in lines: if re.search("^Resource", line): r = AuditResource(self.CM, line) if r.rtype == "controld" and r.parent != "NA": self.CM.debug("Found dlm: %s" % self.r_dlm) self.r_dlm = r.parent return 1 return 0 def find_hae_resources(self, node): self.r_dlm = None self.r_o2cb = None self.r_ocfs2 = [] if self.find_dlm(node): self.find_ocfs2_resources(node) def is_applicable(self): if not self.is_applicable_common(): return 0 if self.CM.Env["Schema"] == "hae": return 1 return None #################################################################### class HAERoleTest(HAETest): #################################################################### def __init__(self, cm): '''Lars' mount/unmount test for the HA extension.
''' HAETest.__init__(self,cm) self.name="HAERoleTest" def change_state(self, node, resource, target): rc = self.CM.rsh(node, "crm_resource -V -r %s -p target-role -v %s --meta" % (resource, target)) return rc def __call__(self, node): self.incr("calls") lpc = 0 failed = 0 delay = 2 done=time.time() + self.CM.Env["loop-minutes"]*60 self.find_hae_resources(node) clone_max = len(self.CM.Env["nodes"]) while time.time() <= done and not failed: lpc = lpc + 1 self.change_state(node, self.r_dlm, "Stopped") if not self.wait_on_state(node, self.r_dlm, 0): self.failure("%s did not go down correctly" % self.r_dlm) failed = lpc self.change_state(node, self.r_dlm, "Started") if not self.wait_on_state(node, self.r_dlm, clone_max): self.failure("%s did not come up correctly" % self.r_dlm) failed = lpc if not self.wait_on_state(node, self.r_o2cb, clone_max): self.failure("%s did not come up correctly" % self.r_o2cb) failed = lpc for fs in self.r_ocfs2: if not self.wait_on_state(node, fs, clone_max): self.failure("%s did not come up correctly" % fs) failed = lpc if failed: return self.failure("iteration %d failed" % failed) return self.success() AllTestClasses.append(HAERoleTest) #################################################################### class HAEStandbyTest(HAETest): #################################################################### '''Repeatedly put a node in and out of standby while the HAE services are running''' def __init__(self, cm): HAETest.__init__(self,cm) self.name="HAEStandbyTest" def change_state(self, node, resource, target): rc = self.CM.rsh(node, "crm_standby -V -l reboot -v %s" % (target)) return rc def __call__(self, node): self.incr("calls") lpc = 0 failed = 0 done=time.time() + self.CM.Env["loop-minutes"]*60 self.find_hae_resources(node) clone_max = len(self.CM.Env["nodes"]) while time.time() <= done and not failed: lpc = lpc + 1 self.change_state(node, self.r_dlm, "true") if not self.wait_on_state(node, self.r_dlm, clone_max-1): self.failure("%s did not go down correctly" % self.r_dlm) failed = lpc self.change_state(node, self.r_dlm, "false") if not self.wait_on_state(node, self.r_dlm, clone_max): self.failure("%s did not come up correctly" % self.r_dlm) failed = lpc if not self.wait_on_state(node, self.r_o2cb, clone_max): self.failure("%s did not come up correctly" % self.r_o2cb) failed = lpc for fs in self.r_ocfs2: if not self.wait_on_state(node, fs, clone_max): self.failure("%s did not come up correctly" % fs) failed = lpc if failed: return self.failure("iteration %d failed" % failed) return self.success() AllTestClasses.append(HAEStandbyTest) ################################################################### class NearQuorumPointTest(CTSTest): ################################################################### ''' This test brings larger clusters near the quorum point (50%). In addition, it will test doing starts and stops at the same time. Here is how I think it should work: - loop over the nodes and decide randomly which will be up and which will be down. Use a 50% probability for each of up/down. - figure out what to do to get into that state from the current state - in parallel, bring up those going up and bring down those going down. ''' def __init__(self, cm): CTSTest.__init__(self,cm) self.name="NearQuorumPoint" def __call__(self, dummy): '''Perform the 'NearQuorumPoint' test.
''' self.incr("calls") startset = [] stopset = [] stonith = self.CM.prepare_fencing_watcher("NearQuorumPoint") #decide what to do with each node for node in self.CM.Env["nodes"]: action = self.CM.Env.RandomGen.choice(["start","stop"]) #action = self.CM.Env.RandomGen.choice(["start","stop","no change"]) if action == "start" : startset.append(node) elif action == "stop" : stopset.append(node) self.CM.debug("start nodes:" + repr(startset)) self.CM.debug("stop nodes:" + repr(stopset)) #add search patterns watchpats = [ ] for node in stopset: if self.CM.ShouldBeStatus[node] == "up": watchpats.append(self.CM["Pat:We_stopped"] % node) for node in startset: if self.CM.ShouldBeStatus[node] == "down": #watchpats.append(self.CM["Pat:Slave_started"] % node) watchpats.append(self.CM["Pat:Local_started"] % node) else: for stopping in stopset: if self.CM.ShouldBeStatus[stopping] == "up": watchpats.append(self.CM["Pat:They_stopped"] % (node, self.CM.key_for_node(stopping))) if len(watchpats) == 0: return self.skipped() if len(startset) != 0: watchpats.append(self.CM["Pat:DC_IDLE"]) watch = self.create_watch(watchpats, self.CM["DeadTime"]+10) watch.setwatch() #begin actions for node in stopset: if self.CM.ShouldBeStatus[node] == "up": self.CM.StopaCMnoBlock(node) for node in startset: if self.CM.ShouldBeStatus[node] == "down": self.CM.StartaCMnoBlock(node) #get the result if watch.lookforall(): self.CM.cluster_stable() self.CM.fencing_cleanup("NearQuorumPoint", stonith) return self.success() self.CM.log("Warn: Patterns not found: " + repr(watch.unmatched)) #get the "bad" nodes upnodes = [] for node in stopset: if self.CM.StataCM(node) == 1: upnodes.append(node) downnodes = [] for node in startset: if self.CM.StataCM(node) == 0: downnodes.append(node) self.CM.fencing_cleanup("NearQuorumPoint", stonith) if upnodes == [] and downnodes == []: self.CM.cluster_stable() # Make sure they're completely down with no residue for node in stopset: self.CM.rsh(node, self.CM["StopCmd"]) return self.success() if len(upnodes) > 0: self.CM.log("Warn: Unstoppable nodes: " + repr(upnodes)) if len(downnodes) > 0: self.CM.log("Warn: Unstartable nodes: " + repr(downnodes)) return self.failure() + def is_applicable(self): + if self.CM["Name"] == "crm-cman": + return None + return 1 + AllTestClasses.append(NearQuorumPointTest) ################################################################### class RollingUpgradeTest(CTSTest): ################################################################### '''Perform a rolling upgrade of the cluster''' def __init__(self, cm): CTSTest.__init__(self,cm) self.name="RollingUpgrade" self.start = StartTest(cm) self.stop = StopTest(cm) self.stopall = SimulStopLite(cm) self.startall = SimulStartLite(cm) def setup(self, node): # Stop all the nodes so they can be downgraded ret = self.stopall(None) if not ret: return self.failure("Couldn't stop all nodes") for node in self.CM.Env["nodes"]: if not self.downgrade(node, None): return self.failure("Couldn't downgrade %s" % node) ret = self.startall(None) if not ret: return self.failure("Couldn't start all nodes") return self.success() def teardown(self, node): # Stop everything ret = self.stopall(None) if not ret: return self.failure("Couldn't stop all nodes") for node in self.CM.Env["nodes"]: if not self.upgrade(node, None): return self.failure("Couldn't upgrade %s" % node) return self.success() def install(self, node, version, start=1, flags="--force"): target_dir = "/tmp/rpm-%s" % version src_dir = "%s/%s" % (self.CM.Env["rpm-dir"], version) self.CM.log("Installing %s
on %s with %s" % (version, node, flags)) if not self.stop(node): return self.failure("stop failure: "+node) rc = self.CM.rsh(node, "mkdir -p %s" % target_dir) rc = self.CM.rsh(node, "rm -f %s/*.rpm" % target_dir) (rc, lines) = self.CM.rsh(node, "ls -1 %s/*.rpm" % src_dir, None) for line in lines: line = line[:-1] rc = self.CM.rsh.cp("%s" % (line), "%s:%s/" % (node, target_dir)) rc = self.CM.rsh(node, "rpm -Uvh %s %s/*.rpm" % (flags, target_dir)) if start and not self.start(node): return self.failure("start failure: "+node) return self.success() def upgrade(self, node, start=1): return self.install(node, self.CM.Env["current-version"], start) def downgrade(self, node, start=1): return self.install(node, self.CM.Env["previous-version"], start, "--force --nodeps") def __call__(self, node): '''Perform the 'Rolling Upgrade' test. ''' self.incr("calls") for node in self.CM.Env["nodes"]: if not self.upgrade(node): return self.failure("Couldn't upgrade %s" % node) self.CM.cluster_stable() return self.success() def is_applicable(self): if not self.is_applicable_common(): return None if not self.CM.Env.has_key("rpm-dir"): return None if not self.CM.Env.has_key("current-version"): return None if not self.CM.Env.has_key("previous-version"): return None return 1 # Register RollingUpgradeTest as a good test to run AllTestClasses.append(RollingUpgradeTest) ################################################################### class BSC_AddResource(CTSTest): ################################################################### '''Add a resource to the cluster''' def __init__(self, cm): CTSTest.__init__(self, cm) self.name="AddResource" self.resource_offset = 0 self.cib_cmd="""cibadmin -C -o %s -X '%s' """ def __call__(self, node): self.incr("calls") self.resource_offset = self.resource_offset + 1 r_id = "bsc-rsc-%s-%d" % (node, self.resource_offset) start_pat = "crmd.*%s_start_0.*confirmed.*ok" patterns = [] patterns.append(start_pat % r_id) watch = self.create_watch(patterns, self.CM["DeadTime"]) watch.setwatch() fields = string.split(self.CM.Env["IPBase"], '.') fields[3] = str(int(fields[3])+1) ip = string.join(fields, '.') self.CM.Env["IPBase"] = ip if not self.make_ip_resource(node, r_id, "ocf", "IPaddr", ip): return self.failure("Make resource %s failed" % r_id) failed = 0 watch_result = watch.lookforall() if watch.unmatched: for regex in watch.unmatched: self.CM.log ("Warn: Pattern not found: %s" % (regex)) failed = 1 if failed: return self.failure("Resource pattern(s) not found") if not self.CM.cluster_stable(self.CM["DeadTime"]): return self.failure("Unstable cluster") return self.success() def make_ip_resource(self, node, id, rclass, type, ip): self.CM.log("Creating %s::%s:%s (%s) on %s" % (rclass,type,id,ip,node)) rsc_xml=""" """ % (id, rclass, type, id, id, ip) node_constraint=""" """ % (id, id, id, id, node) rc = 0 (rc, lines) = self.CM.rsh(node, self.cib_cmd % ("constraints", node_constraint), None) if rc != 0: self.CM.log("Constraint creation failed: %d" % rc) return None (rc, lines) = self.CM.rsh(node, self.cib_cmd % ("resources", rsc_xml), None) if rc != 0: self.CM.log("Resource creation failed: %d" % rc) return None return 1 def is_applicable(self): if self.CM.Env["DoBSC"]: return 1 return None AllTestClasses.append(BSC_AddResource) class SimulStopLite(CTSTest): ################################################################### '''Stop any active nodes ~ simultaneously''' def __init__(self, cm): CTSTest.__init__(self,cm) self.name="SimulStopLite" def __call__(self, dummy): '''Perform the 'SimulStopLite'
setup work. ''' self.incr("calls") self.CM.debug("Setup: " + self.name) # We ignore the "node" parameter... watchpats = [ ] for node in self.CM.Env["nodes"]: if self.CM.ShouldBeStatus[node] == "up": self.incr("WasStarted") watchpats.append(self.CM["Pat:We_stopped"] % node) #if self.CM.Env["use_logd"]: # watchpats.append(self.CM["Pat:Logd_stopped"] % node) if len(watchpats) == 0: self.CM.clear_all_caches() return self.success() # Stop all the nodes - at about the same time... watch = self.create_watch(watchpats, self.CM["DeadTime"]+10) watch.setwatch() self.set_timer() for node in self.CM.Env["nodes"]: if self.CM.ShouldBeStatus[node] == "up": self.CM.StopaCMnoBlock(node) if watch.lookforall(): self.CM.clear_all_caches() # Make sure they're completely down with no residue for node in self.CM.Env["nodes"]: self.CM.rsh(node, self.CM["StopCmd"]) return self.success() did_fail=0 up_nodes = [] for node in self.CM.Env["nodes"]: if self.CM.StataCM(node) == 1: did_fail=1 up_nodes.append(node) if did_fail: return self.failure("Active nodes exist: " + repr(up_nodes)) self.CM.log("Warn: All nodes stopped but CTS didn't detect: " + repr(watch.unmatched)) self.CM.clear_all_caches() return self.failure("Missing log message: "+repr(watch.unmatched)) def is_applicable(self): '''SimulStopLite is a setup test and never applicable''' return 0 ################################################################### class SimulStartLite(CTSTest): ################################################################### '''Start any stopped nodes ~ simultaneously''' def __init__(self, cm): CTSTest.__init__(self,cm) self.name="SimulStartLite" def __call__(self, dummy): '''Perform the 'SimulStartLite' setup work. ''' self.incr("calls") self.CM.debug("Setup: " + self.name) # We ignore the "node" parameter... node_list = [] for node in self.CM.Env["nodes"]: if self.CM.ShouldBeStatus[node] == "down": self.incr("WasStopped") node_list.append(node) self.set_timer() while len(node_list) > 0: watchpats = [ ] uppat = self.CM["Pat:Slave_started"] if self.CM.upcount() == 0: uppat = self.CM["Pat:Local_started"] watchpats.append(self.CM["Pat:DC_IDLE"]) for node in node_list: watchpats.append(uppat % node) watchpats.append(self.CM["Pat:InfraUp"] % node) watchpats.append(self.CM["Pat:PacemakerUp"] % node) # Start all the nodes - at about the same time...
watch = self.create_watch(watchpats, self.CM["DeadTime"]+10) watch.setwatch() stonith = self.CM.prepare_fencing_watcher(self.name) for node in node_list: self.CM.StartaCMnoBlock(node) watch.lookforall() node_list = self.CM.fencing_cleanup(self.name, stonith) # Remove node_list messages from watch.unmatched for node in node_list: if watch.unmatched: watch.unmatched.remove(uppat % node) if watch.unmatched: for regex in watch.unmatched: self.CM.log ("Warn: Startup pattern not found: %s" %(regex)) if not self.CM.cluster_stable(): return self.failure("Cluster did not stabilize") did_fail=0 unstable = [] for node in self.CM.Env["nodes"]: if self.CM.StataCM(node) == 0: did_fail=1 unstable.append(node) if did_fail: return self.failure("Unstarted nodes exist: " + repr(unstable)) unstable = [] for node in self.CM.Env["nodes"]: if not self.CM.node_stable(node): did_fail=1 unstable.append(node) if did_fail: return self.failure("Unstable cluster nodes exist: " + repr(unstable)) return self.success() def is_applicable(self): '''SimulStartLite is a setup test and never applicable''' return 0 def TestList(cm, audits): result = [] for testclass in AllTestClasses: bound_test = testclass(cm) if bound_test.is_applicable(): bound_test.Audits = audits result.append(bound_test) return result # vim:ts=4:sw=4:et: diff --git a/doc/Clusters_from_Scratch/en-US/Ap-Configuration.txt b/doc/Clusters_from_Scratch/en-US/Ap-Configuration.txt index 48be6c48a5..5852e7eb04 100644 --- a/doc/Clusters_from_Scratch/en-US/Ap-Configuration.txt +++ b/doc/Clusters_from_Scratch/en-US/Ap-Configuration.txt @@ -1,498 +1,498 @@ [appendix] == Configuration Recap == === Final Cluster Configuration === ifdef::pcs[] [source,C] ---- # pcs resource Master/Slave Set: WebDataClone [WebData] Masters: [ pcmk-2 pcmk-1 ] Clone Set: dlm-clone [dlm] Started: [ pcmk-2 pcmk-1 ] Clone Set: ClusterIP-clone [ClusterIP] (unique) ClusterIP:0 (ocf::heartbeat:IPaddr2) Started ClusterIP:1 (ocf::heartbeat:IPaddr2) Started Clone Set: WebFS-clone [WebFS] Started: [ pcmk-1 pcmk-2 ] Clone Set: WebSite-clone [WebSite] Started: [ pcmk-1 pcmk-2 ] # pcs resource rsc defaults resource-stickiness: 100 # pcs resource op defaults timeout: 240s # pcs stonith impi-fencing (stonith:fence_ipmilan) Started # pcs property dc-version: 1.1.8-1.el7-60a19ed12fdb4d5c6a6b6767f52e5391e447fec0 cluster-infrastructure: corosync no-quorum-policy: ignore stonith-enabled: true # pcs constraint Location Constraints: Ordering Constraints: ClusterIP-clone then WebSite-clone WebDataClone then WebSite-clone WebFS-clone then WebSite-clone Colocation Constraints: WebSite-clone with ClusterIP-clone WebFS-clone with WebDataClone (with-rsc-role:Master) WebSite-clone with WebFS-clone # # pcs status Last updated: Fri Sep 14 13:45:34 2012 Last change: Fri Sep 14 13:43:13 2012 via cibadmin on pcmk-1 Stack: corosync Current DC: pcmk-1 (1) - partition with quorum Version: 1.1.8-1.el7-60a19ed12fdb4d5c6a6b6767f52e5391e447fec0 2 Nodes configured, unknown expected votes 11 Resources configured. 
Online: [ pcmk-1 pcmk-2 ] Full list of resources: Master/Slave Set: WebDataClone [WebData] Masters: [ pcmk-2 pcmk-1 ] Clone Set: dlm-clone [dlm] Started: [ pcmk-1 pcmk-2 ] Clone Set: ClusterIP-clone [ClusterIP] (unique) ClusterIP:0 (ocf::heartbeat:IPaddr2): Started pcmk-1 ClusterIP:1 (ocf::heartbeat:IPaddr2): Started pcmk-2 Clone Set: WebFS-clone [WebFS] Started: [ pcmk-1 pcmk-2 ] Clone Set: WebSite-clone [WebSite] Started: [ pcmk-1 pcmk-2 ] impi-fencing (stonith:fence_ipmilan): Started ---- In XML, it should look similar to this. [source,XML] ---- ---- endif::[] -ifdef::crm[] +ifdef::crmsh[] ..... # crm configure show node pcmk-1 node pcmk-2 primitive WebData ocf:linbit:drbd \ params drbd_resource="wwwdata" \ op monitor interval="60s" primitive WebFS ocf:heartbeat:Filesystem \ params device="/dev/drbd/by-res/wwwdata" directory="/var/www/html" fstype="gfs2" primitive WebSite ocf:heartbeat:apache \ params configfile="/etc/httpd/conf/httpd.conf" \ op monitor interval="1min" primitive ClusterIP ocf:heartbeat:IPaddr2 \ params ip="192.168.122.101" cidr_netmask="32" clusterip_hash="sourceip" \ op monitor interval="30s" primitive ipmi-fencing stonith::fence_ipmilan \ params pcmk_host_list="pcmk-1 pcmk-2" ipaddr=10.0.0.1 login=testuser passwd=abc123 \ op monitor interval="60s" ms WebDataClone WebData \ meta master-max="2" master-node-max="1" clone-max="2" clone-node-max="1" notify="true" clone WebFSClone WebFS clone WebIP ClusterIP \ meta globally-unique="true" clone-max="2" clone-node-max="2" clone WebSiteClone WebSite colocation WebSite-with-WebFS inf: WebSiteClone WebFSClone colocation fs_on_drbd inf: WebFSClone WebDataClone:Master colocation website-with-ip inf: WebSiteClone WebIP order WebFS-after-WebData inf: WebDataClone:promote WebFSClone:start order WebSite-after-WebFS inf: WebFSClone WebSiteClone order apache-after-ip inf: WebIP WebSiteClone property $id="cib-bootstrap-options" \ dc-version="1.1.5-bdd89e69ba545404d02445be1f3d72e6a203ba2f" \ cluster-infrastructure="openais" \ expected-quorum-votes="2" \ stonith-enabled="true" \ no-quorum-policy="ignore" rsc_defaults $id="rsc-options" \ resource-stickiness="100" ..... endif::[] === Node List === The list of cluster nodes is automatically populated by the cluster. ifdef::pcs[] ..... Pacemaker Nodes: Online: [ pcmk-1 pcmk-2 ] ..... endif::[] -ifdef::crm[] +ifdef::crmsh[] ..... node pcmk-1 node pcmk-2 ..... endif::[] === Cluster Options === This is where the cluster automatically stores some information about the cluster, and where the admin can set options that control the way the cluster operates: * dc-version - the version (including upstream source-code hash) of Pacemaker used on the DC * cluster-infrastructure - the cluster infrastructure being used (heartbeat or openais) * expected-quorum-votes - the maximum number of nodes expected to be part of the cluster * stonith-enabled=true - Make use of STONITH * no-quorum-policy=ignore - Ignore loss of quorum and continue to host resources. ifdef::pcs[] [source,C] ---- # pcs property dc-version: 1.1.8-1.el7-60a19ed12fdb4d5c6a6b6767f52e5391e447fec0 cluster-infrastructure: corosync no-quorum-policy: ignore stonith-enabled: true ---- endif::[] -ifdef::crm[] +ifdef::crmsh[] ..... property $id="cib-bootstrap-options" \ dc-version="1.1.5-bdd89e69ba545404d02445be1f3d72e6a203ba2f" \ cluster-infrastructure="openais" \ expected-quorum-votes="2" \ stonith-enabled="true" \ no-quorum-policy="ignore" ..... endif::[]
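Any of these options can also be checked individually from the command line with +crm_attribute+, the same tool the teardown scripts elsewhere in this code base use. A minimal illustration, querying the stonith-enabled option (the exact output format may vary with your Pacemaker version):

.....
# crm_attribute -G -t crm_config -n stonith-enabled
scope=crm_config  name=stonith-enabled value=true
.....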
endif::[] === Resources === ==== Default Options ==== Here we configure cluster options that apply to every resource. ifdef::pcs[] * resource-stickiness - Specify the aversion to moving resources to other machines [source,C] ---- # pcs resource rsc defaults resource-stickiness: 100 ---- endif::[] -ifdef::crm[] +ifdef::crmsh[] * resource-stickiness - Specify the aversion to moving resources to other machines ..... rsc_defaults $id="rsc-options" \ resource-stickiness="100" ..... endif::[] ==== Fencing ==== ifdef::pcs[] [source,C] ---- # pcs stonith show impi-fencing (stonith:fence_ipmilan) Started # pcs stonith show impi-fencing Resource: impi-fencing pcmk_host_list: pcmk-1 pcmk-2 ipaddr: 10.0.0.1 login: testuser passwd: abc123 ---- endif::[] -ifdef::crm[] +ifdef::crmsh[] ..... primitive ipmi-fencing stonith::fence_ipmilan \ params pcmk_host_list="pcmk-1 pcmk-2" ipaddr=10.0.0.1 login=testuser passwd=abc123 \ op monitor interval="60s" clone Fencing rsa-fencing ..... endif::[] ==== Service Address ==== Users of the services provided by the cluster require an unchanging address with which to access them. Additionally, we cloned the address so it will be active on both nodes. An iptables rule (created as part of the resource agent) is used to ensure that each request only gets processed by one of the two clone instances. The additional meta options tell the cluster that we want two instances of the clone (one "request bucket" for each node) and that if one node fails, then the remaining node should hold both. ifdef::pcs[] [source,C] ---- # pcs resource show ClusterIP-clone Resource: ClusterIP-clone ip: 192.168.0.120 cidr_netmask: 32 clusterip_hash: sourceip globally-unique: true clone-max: 2 clone-node-max: 2 op monitor interval=30s ---- endif::[] -ifdef::crm[] +ifdef::crmsh[] ..... primitive ClusterIP ocf:heartbeat:IPaddr2 \ params ip="192.168.122.101" cidr_netmask="32" clusterip_hash="sourceip" \ op monitor interval="30s" clone WebIP ClusterIP meta globally-unique="true" clone-max="2" clone-node-max="2" ..... endif::[] [NOTE] ======= TODO: The RA should check for globally-unique=true when cloned ======= ==== DRBD - Shared Storage ==== Here we define the DRBD service and specify which DRBD resource (from drbd.conf) it should manage. We make it a master/slave resource and, in order to have an active/active setup, allow both instances to be promoted by specifying master-max=2. We also set the notify option so that the cluster will tell the DRBD agent when its peer changes state. ifdef::pcs[] [source,C] ---- # pcs resource show WebDataClone Resource: WebDataClone drbd_resource: wwwdata master-node-max: 1 clone-max: 2 clone-node-max: 1 notify: true master-max: 2 op monitor interval=60s # pcs constraint ref WebDataClone Resource: WebDataClone colocation-WebFS-WebDataClone-INFINITY order-WebDataClone-WebFS-mandatory ---- endif::[] -ifdef::crm[] +ifdef::crmsh[] ..... primitive WebData ocf:linbit:drbd \ params drbd_resource="wwwdata" \ op monitor interval="60s" ms WebDataClone WebData \ meta master-max="2" master-node-max="1" clone-max="2" clone-node-max="1" notify="true" ..... endif::[] ==== Cluster Filesystem ==== The cluster filesystem ensures that files are read and written correctly. We need to specify the block device (provided by DRBD), where we want it mounted and that we are using GFS2. Again it is a clone because it is intended to be active on both nodes. The additional constraints ensure that it can only be started on nodes with active gfs-control and drbd instances.
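For readers curious about what lies beneath these shell views, here is a minimal sketch of how such a Filesystem primitive might be represented in the raw CIB XML (the attribute ids are illustrative, not copied from a live cluster):

[source,XML]
----
<primitive id="WebFS" class="ocf" provider="heartbeat" type="Filesystem">
  <instance_attributes id="WebFS-params">
    <nvpair id="WebFS-device" name="device" value="/dev/drbd/by-res/wwwdata"/>
    <nvpair id="WebFS-directory" name="directory" value="/var/www/html"/>
    <nvpair id="WebFS-fstype" name="fstype" value="gfs2"/>
  </instance_attributes>
</primitive>
----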
ifdef::pcs[] [source,C] ---- # pcs resource show WebFS-clone Resource: WebFS-clone device: /dev/drbd/by-res/wwwdata directory: /var/www/html fstype: gfs2 # pcs constraint ref WebFS-clone Resource: WebFS-clone colocation-WebFS-WebDataClone-INFINITY colocation-WebSite-WebFS-INFINITY order-WebFS-WebSite-mandatory order-WebDataClone-WebFS-mandatory ---- endif::[] -ifdef::crm[] +ifdef::crmsh[] ..... primitive WebFS ocf:heartbeat:Filesystem \ params device="/dev/drbd/by-res/wwwdata" directory="/var/www/html" fstype="gfs2" clone WebFSClone WebFS colocation WebFS-with-gfs-control inf: WebFSClone gfs-clone colocation fs_on_drbd inf: WebFSClone WebDataClone:Master order WebFS-after-WebData inf: WebDataClone:promote WebFSClone:start order start-WebFS-after-gfs-control inf: gfs-clone WebFSClone ..... endif::[] ==== Apache ==== Lastly we have the actual service, Apache. We need only tell the cluster where to find its main configuration file and restrict it to running on nodes that have the required filesystem mounted and the IP address active. ifdef::pcs[] [source,C] ---- # pcs resource show WebSite-clone Resource: WebSite-clone configfile: /etc/httpd/conf/httpd.conf statusurl: http://localhost/server-status master-max: 2 op monitor interval=1min # pcs constraint ref WebSite-clone Resource: WebSite-clone colocation-WebSite-ClusterIP-INFINITY colocation-WebSite-WebFS-INFINITY order-ClusterIP-WebSite-mandatory order-WebFS-WebSite-mandatory ---- endif::[] -ifdef::crm[] +ifdef::crmsh[] ..... primitive WebSite ocf:heartbeat:apache \ params configfile="/etc/httpd/conf/httpd.conf" \ op monitor interval="1min" clone WebSiteClone WebSite colocation WebSite-with-WebFS inf: WebSiteClone WebFSClone colocation website-with-ip inf: WebSiteClone WebIP order apache-after-ip inf: WebIP WebSiteClone order WebSite-after-WebFS inf: WebFSClone WebSiteClone ..... endif::[] diff --git a/doc/Clusters_from_Scratch/en-US/Ap-Corosync-Conf.txt b/doc/Clusters_from_Scratch/en-US/Ap-Corosync-Conf.txt index 4fa502a589..0ab5d5767f 100644 --- a/doc/Clusters_from_Scratch/en-US/Ap-Corosync-Conf.txt +++ b/doc/Clusters_from_Scratch/en-US/Ap-Corosync-Conf.txt @@ -1,121 +1,121 @@ [appendix] == Sample Corosync Configuration == ifdef::pcs[] .Sample corosync.conf for a two-node cluster using a node list. ..... # Please read the corosync.conf.5 manual page totem { version: 2 secauth: off cluster_name: mycluster transport: udpu } nodelist { node { ring0_addr: pcmk-1 nodeid: 1 } node { ring0_addr: pcmk-2 nodeid: 2 } } quorum { provider: corosync_votequorum } logging { to_syslog: yes } ..... endif::[] -ifdef::crm[] +ifdef::crmsh[] .Sample corosync.conf for a two-node cluster using multicast. ..... # Please read the corosync.conf.5 manual page totem { version: 2 # crypto_cipher and crypto_hash: Used for mutual node authentication. # If you choose to enable this, then do remember to create a shared # secret with "corosync-keygen". crypto_cipher: none crypto_hash: none # interface: define at least one interface to communicate # over. If you define more than one interface stanza, you must # also set rrp_mode. interface { # Rings must be consecutively numbered, starting at 0. ringnumber: 0 # This is normally the *network* address of the # interface to bind to. This ensures that you can use # identical instances of this configuration file # across all your cluster nodes, without having to # modify this option.
bindnetaddr: 192.168.122.0 # However, if you have multiple physical network # interfaces configured for the same subnet, then the # network address alone is not sufficient to identify # the interface Corosync should bind to. In that case, # configure the *host* address of the interface # instead: # bindnetaddr: 192.168.1.1 # When selecting a multicast address, consider RFC # 2365 (which, among other things, specifies that # 239.255.x.x addresses are left to the discretion of # the network administrator). Do not reuse multicast # addresses across multiple Corosync clusters sharing # the same network. mcastaddr: 239.255.1.1 # Corosync uses the port you specify here for UDP # messaging, and also the immediately preceding # port. Thus if you set this to 5405, Corosync sends # messages over UDP ports 5405 and 5404. mcastport: 4000 # Time-to-live for cluster communication packets. The # number of hops (routers) that this ring will allow # itself to pass. Note that multicast routing must be # specifically enabled on most network routers. ttl: 1 } } logging { # Log the source file and line where messages are being # generated. When in doubt, leave off. Potentially useful for # debugging. fileline: off # Log to standard error. When in doubt, set to no. Useful when # running in the foreground (when invoking "corosync -f") to_stderr: no # Log to a log file. When set to "no", the "logfile" option # must not be set. to_logfile: yes logfile: /var/log/cluster/corosync.log # Log to the system log daemon. When in doubt, set to yes. to_syslog: yes # Log debug messages (very verbose). When in doubt, leave off. debug: off # Log messages with time stamps. When in doubt, set to on # (unless you are only logging to syslog, where double # timestamps can be annoying). timestamp: on logger_subsys { subsys: QUORUM debug: off } } quorum { provider: corosync_votequorum expected_votes: 2 } ..... endif::[] diff --git a/doc/Clusters_from_Scratch/en-US/Ch-Active-Active.txt b/doc/Clusters_from_Scratch/en-US/Ch-Active-Active.txt index 571d9ed5a0..eba78be997 100644 --- a/doc/Clusters_from_Scratch/en-US/Ch-Active-Active.txt +++ b/doc/Clusters_from_Scratch/en-US/Ch-Active-Active.txt @@ -1,755 +1,755 @@ = Conversion to Active/Active = == Requirements == The primary requirement for an Active/Active cluster is that the data required for your services is available, simultaneously, on both machines. Pacemaker makes no requirement on how this is achieved; you could use a SAN if you had one available, but since DRBD supports multiple Primaries, we can also use that. The only hitch is that we need to use a cluster-aware filesystem. The one we used earlier with DRBD, ext4, is not one of those. Both OCFS2 and GFS2 are supported; here we will use GFS2, which comes with Fedora 17. === Installing the required Software === [source,C] # yum install -y gfs2-utils dlm kernel-modules-extra .....
Loaded plugins: langpacks, presto, refresh-packagekit Resolving Dependencies --> Running transaction check ---> Package dlm.x86_64 0:3.99.4-1.fc17 will be installed ---> Package gfs2-utils.x86_64 0:3.1.4-3.fc17 will be installed ---> Package kernel-modules-extra.x86_64 0:3.4.4-3.fc17 will be installed --> Finished Dependency Resolution Dependencies Resolved ================================================================================ Package Arch Version Repository Size ================================================================================ Installing: dlm x86_64 3.99.4-1.fc17 updates 83 k gfs2-utils x86_64 3.1.4-3.fc17 fedora 214 k kernel-modules-extra x86_64 3.4.4-3.fc17 updates 1.7 M Transaction Summary ================================================================================ Install 3 Packages Total download size: 1.9 M Installed size: 7.7 M Downloading Packages: (1/3): dlm-3.99.4-1.fc17.x86_64.rpm | 83 kB 00:00 (2/3): gfs2-utils-3.1.4-3.fc17.x86_64.rpm | 214 kB 00:00 (3/3): kernel-modules-extra-3.4.4-3.fc17.x86_64.rpm | 1.7 MB 00:01 -------------------------------------------------------------------------------- Total 615 kB/s | 1.9 MB 00:03 Running Transaction Check Running Transaction Test Transaction Test Succeeded Running Transaction Installing : kernel-modules-extra-3.4.4-3.fc17.x86_64 1/3 Installing : gfs2-utils-3.1.4-3.fc17.x86_64 2/3 Installing : dlm-3.99.4-1.fc17.x86_64 3/3 Verifying : dlm-3.99.4-1.fc17.x86_64 1/3 Verifying : gfs2-utils-3.1.4-3.fc17.x86_64 2/3 Verifying : kernel-modules-extra-3.4.4-3.fc17.x86_64 3/3 Installed: dlm.x86_64 0:3.99.4-1.fc17 gfs2-utils.x86_64 0:3.1.4-3.fc17 kernel-modules-extra.x86_64 0:3.4.4-3.fc17 Complete! ..... == Create a GFS2 Filesystem == [[GFS2_prep]] === Preparation === Before we do anything to the existing partition, we need to make sure it is unmounted. We do this by telling the cluster to stop the WebFS resource. This will ensure that other resources (in our case, Apache) using WebFS are not only stopped, but stopped in the correct order. ifdef::pcs[] [source,C] ---- # pcs resource stop WebFS # pcs resource ClusterIP (ocf::heartbeat:IPaddr2) Started WebSite (ocf::heartbeat:apache) Stopped Master/Slave Set: WebDataClone [WebData] Masters: [ pcmk-2 ] Slaves: [ pcmk-1 ] WebFS (ocf::heartbeat:Filesystem) Stopped ---- endif::[] -ifdef::crm[] +ifdef::crmsh[] [source,C] ----- # crm resource stop WebFS # crm_mon -1 ============ Last updated: Tue Apr 3 14:07:36 2012 Last change: Tue Apr 3 14:07:15 2012 via cibadmin on pcmk-1 Stack: corosync Current DC: pcmk-1 (1702537408) - partition with quorum Version: 1.1.7-2.fc17-ee0730e13d124c3d58f00016c3376a1de5323cff 2 Nodes configured, unknown expected votes 5 Resources configured. ============ Online: [ pcmk-1 pcmk-2 ] ClusterIP (ocf::heartbeat:IPaddr2): Started pcmk-2 Master/Slave Set: WebDataClone [WebData] Masters: [ pcmk-2 ] Slaves: [ pcmk-1 ] ----- endif::[] [NOTE] ======= Note that both Apache and WebFS have been stopped. ======= === Create and Populate a GFS2 Partition === Now that the cluster stack and integration pieces are running smoothly, we can create a GFS2 partition. [WARNING] ========= This will erase all previous content stored on the DRBD device. Ensure you have a copy of any important data. ========= We need to specify a number of additional parameters when creating a GFS2 partition. First we must use the -p option to specify that we want to use the kernel's DLM.
Next we use -j to indicate that it should reserve enough space for two journals (one per node accessing the filesystem). ifdef::pcs[] Lastly, we use -t to specify the lock table name. The format for this field is +clustername:fsname+. For the +fsname+, we need to use the same value as specified in 'corosync.conf' for +cluster_name+. If you set up corosync with the same cluster name we used in this tutorial, the cluster name will be 'mycluster'. If you are unsure what your cluster name is, open up /etc/corosync/corosync.conf, or execute the command 'pcs cluster corosync pcmk-1' to view the corosync config. The cluster name will be in the +totem+ block. endif::[] -ifdef::crm[] +ifdef::crmsh[] Lastly, we use -t to specify the lock table name. The format for this field is +clustername:fsname+. For the +fsname+, we need to use the same value as specified in 'corosync.conf' for +cluster_name+. Just pick something unique and descriptive and add it somewhere inside the +totem+ block. For example: ..... totem { version: 2 # crypto_cipher and crypto_hash: Used for mutual node authentication. # If you choose to enable this, then do remember to create a shared # secret with "corosync-keygen". crypto_cipher: none crypto_hash: none cluster_name: mycluster ... ..... [IMPORTANT] =========== Do this on each node in the cluster and be sure to restart them before continuing. =========== endif::[] [IMPORTANT] =========== We must run the next command on whichever node last had '/dev/drbd' mounted. Otherwise you will receive the message: ----- /dev/drbd1: Read-only file system ----- =========== [source,C] ----- # ssh pcmk-2 -- mkfs.gfs2 -p lock_dlm -j 2 -t mycluster:web /dev/drbd1 This will destroy any data on /dev/drbd1. It appears to contain: Linux rev 1.0 ext4 filesystem data, UUID=dc45fff3-c47a-4db2-96f7-a8049a323fe4 (extents) (large files) (huge files) Are you sure you want to proceed? [y/n]y Device: /dev/drbd1 Blocksize: 4096 Device Size 0.97 GB (253935 blocks) Filesystem Size: 0.97 GB (253932 blocks) Journals: 2 Resource Groups: 4 Locking Protocol: "lock_dlm" Lock Table: "mycluster" UUID: ed293a02-9eee-3fa3-ed1c-435ef1fd0116 ----- ifdef::pcs[] [source,C] ---- # pcs cluster cib dlm_cfg # pcs -f dlm_cfg resource create dlm ocf:pacemaker:controld op monitor interval=60s # pcs -f dlm_cfg resource clone dlm clone-max=2 clone-node-max=1 # pcs -f dlm_cfg resource show ClusterIP (ocf::heartbeat:IPaddr2) Started WebSite (ocf::heartbeat:apache) Stopped Master/Slave Set: WebDataClone [WebData] Masters: [ pcmk-2 ] Slaves: [ pcmk-1 ] WebFS (ocf::heartbeat:Filesystem) Stopped Clone Set: dlm-clone [dlm] Stopped: [ dlm:0 dlm:1 ] # pcs cluster push cib dlm_cfg CIB updated # pcs status Last updated: Fri Sep 14 12:54:50 2012 Last change: Fri Sep 14 12:54:43 2012 via cibadmin on pcmk-1 Stack: corosync Current DC: pcmk-1 (1) - partition with quorum Version: 1.1.8-1.el7-60a19ed12fdb4d5c6a6b6767f52e5391e447fec0 2 Nodes configured, unknown expected votes 7 Resources configured.
Online: [ pcmk-1 pcmk-2 ] Full list of resources: ClusterIP (ocf::heartbeat:IPaddr2): Started pcmk-2 WebSite (ocf::heartbeat:apache): Stopped Master/Slave Set: WebDataClone [WebData] Masters: [ pcmk-2 ] Slaves: [ pcmk-1 ] WebFS (ocf::heartbeat:Filesystem): Stopped Clone Set: dlm-clone [dlm] Started: [ pcmk-1 pcmk-2 ] ---- endif::[] -ifdef::crm[] +ifdef::crmsh[] [source,C] ----- # crm crm(live)# cib new dlm INFO: dlm shadow CIB created crm(dlm)# configure primitive dlm ocf:pacemaker:controld \ op monitor interval=60s crm(dlm)# configure clone dlm_clone dlm meta clone-max=2 clone-node-max=1 crm(dlm)# configure show node $id="1702537408" pcmk-1 \ attributes standby="off" node $id="1719314624" pcmk-2 primitive ClusterIP ocf:heartbeat:IPaddr2 \ params ip="192.168.122.120" cidr_netmask="32" \ op monitor interval="30s" primitive WebData ocf:linbit:drbd \ params drbd_resource="wwwdata" \ op monitor interval="60s" primitive WebFS ocf:heartbeat:Filesystem \ params device="/dev/drbd/by-res/wwwdata" directory="/var/www/html" fstype="ext4" \ meta target-role="Stopped" primitive WebSite ocf:heartbeat:apache \ params configfile="/etc/httpd/conf/httpd.conf" \ op monitor interval="1min" primitive dlm ocf:pacemaker:controld \ op monitor interval="60s" ms WebDataClone WebData \ meta master-max="1" master-node-max="1" clone-max="2" clone-node-max="1" notify="true" clone dlm_clone dlm \ meta clone-max="2" clone-node-max="1" location prefer-pcmk-1 WebSite 50: pcmk-1 colocation WebSite-with-WebFS inf: WebSite WebFS colocation fs_on_drbd inf: WebFS WebDataClone:Master colocation website-with-ip inf: WebSite ClusterIP order WebFS-after-WebData inf: WebDataClone:promote WebFS:start order WebSite-after-WebFS inf: WebFS WebSite order apache-after-ip inf: ClusterIP WebSite property $id="cib-bootstrap-options" \ dc-version="1.1.7-2.fc17-ee0730e13d124c3d58f00016c3376a1de5323cff" \ cluster-infrastructure="corosync" \ stonith-enabled="false" \ no-quorum-policy="ignore" \ last-lrm-refresh="1333446866" rsc_defaults $id="rsc-options" \ resource-stickiness="100" op_defaults $id="op-options" \ timeout="240s" crm(dlm)# cib commit dlm INFO: commited 'dlm' shadow CIB to the cluster crm(dlm)# quit bye # crm_mon -1 ============ Last updated: Wed Apr 4 01:15:11 2012 Last change: Wed Apr 4 00:50:11 2012 via crmd on pcmk-1 Stack: corosync Current DC: pcmk-1 (1702537408) - partition with quorum Version: 1.1.7-2.fc17-ee0730e13d124c3d58f00016c3376a1de5323cff 2 Nodes configured, unknown expected votes 7 Resources configured. ============ Online: [ pcmk-1 pcmk-2 ] ClusterIP (ocf::heartbeat:IPaddr2): Started pcmk-1 Master/Slave Set: WebDataClone [WebData] Masters: [ pcmk-1 ] Slaves: [ pcmk-2 ] Clone Set: dlm_clone [dlm] Started: [ pcmk-1 pcmk-2 ] ----- endif::[] Then (re)populate the new filesystem with data (web pages). For now we'll create another variation on our home page. [source,C] ----- # mount /dev/drbd1 /mnt/ # cat <<-END >/mnt/index.html <html> <body>My Test Site - GFS2</body> </html> END # umount /dev/drbd1 # drbdadm verify wwwdata ----- == Reconfigure the Cluster for GFS2 == ifdef::pcs[] With the WebFS resource stopped, let's update the configuration. [source,C] ---- # pcs resource show WebFS Resource: WebFS device: /dev/drbd/by-res/wwwdata directory: /var/www/html fstype: ext4 target-role: Stopped ---- The fstype option needs to be updated to gfs2 instead of ext4.
[source,C] ---- # pcs resource update WebFS fstype=gfs2 # pcs resource show WebFS Resource: WebFS device: /dev/drbd/by-res/wwwdata directory: /var/www/html fstype: gfs2 target-role: Stopped CIB updated ---- endif::[] -ifdef::crm[] +ifdef::crmsh[] [source,C] ----- # crm crm(live) # cib new GFS2 INFO: GFS2 shadow CIB created crm(GFS2) # configure delete WebFS crm(GFS2) # configure primitive WebFS ocf:heartbeat:Filesystem params device="/dev/drbd/by-res/wwwdata" directory="/var/www/html" fstype="gfs2" ----- Now that we've recreated the resource, we also need to recreate all the constraints that used it. This is because the shell will automatically remove any constraints that referenced WebFS. [source,C] ----- crm(GFS2) # configure colocation WebSite-with-WebFS inf: WebSite WebFS crm(GFS2) # configure colocation fs_on_drbd inf: WebFS WebDataClone:Master crm(GFS2) # configure order WebFS-after-WebData inf: WebDataClone:promote WebFS:start crm(GFS2) # configure order WebSite-after-WebFS inf: WebFS WebSite crm(GFS2) # configure show node pcmk-1 node pcmk-2 primitive WebData ocf:linbit:drbd \ params drbd_resource="wwwdata" \ op monitor interval="60s" primitive WebFS ocf:heartbeat:Filesystem \ params device="/dev/drbd/by-res/wwwdata" directory="/var/www/html" fstype="gfs2" primitive WebSite ocf:heartbeat:apache \ params configfile="/etc/httpd/conf/httpd.conf" \ op monitor interval="1min" primitive ClusterIP ocf:heartbeat:IPaddr2 \ params ip="192.168.122.101" cidr_netmask="32" \ op monitor interval="30s" ms WebDataClone WebData \ meta master-max="1" master-node-max="1" clone-max="2" clone-node-max="1" notify="true" colocation WebSite-with-WebFS inf: WebSite WebFS colocation fs_on_drbd inf: WebFS WebDataClone:Master colocation website-with-ip inf: WebSite ClusterIP order WebFS-after-WebData inf: WebDataClone:promote WebFS:start order WebSite-after-WebFS inf: WebFS WebSite order apache-after-ip inf: ClusterIP WebSite property $id="cib-bootstrap-options" \ dc-version="1.1.5-bdd89e69ba545404d02445be1f3d72e6a203ba2f" \ cluster-infrastructure="openais" \ expected-quorum-votes="2" \ stonith-enabled="false" \ no-quorum-policy="ignore" rsc_defaults $id="rsc-options" \ resource-stickiness="100" ----- Review the configuration before uploading it to the cluster, then quit the shell and watch the cluster's response: [source,C] ----- crm(GFS2) # cib commit GFS2 INFO: commited 'GFS2' shadow CIB to the cluster crm(GFS2) # quit bye # crm_mon ============ Last updated: Thu Sep 3 20:49:54 2009 Stack: openais Current DC: pcmk-2 - partition with quorum Version: 1.1.5-bdd89e69ba545404d02445be1f3d72e6a203ba2f 2 Nodes configured, 2 expected votes 6 Resources configured. ============ Online: [ pcmk-1 pcmk-2 ] WebSite (ocf::heartbeat:apache): Started pcmk-2 Master/Slave Set: WebDataClone Masters: [ pcmk-1 ] Slaves: [ pcmk-2 ] ClusterIP (ocf::heartbeat:IPaddr): Started pcmk-2 WebFS (ocf::heartbeat:Filesystem): Started pcmk-1 ----- endif::[] == Reconfigure Pacemaker for Active/Active == Almost everything is in place. Recent versions of DRBD are capable of operating in Primary/Primary mode and the filesystem we're using is cluster-aware. All we need to do now is reconfigure the cluster to take advantage of this. ifdef::pcs[] This will involve a number of changes, so we'll want to work with a local cib file. [source,C] ---- # pcs cluster cib active_cfg ---- endif::[] -ifdef::crm[] +ifdef::crmsh[] This will involve a number of changes, so we'll again use interactive mode.
[source,C] ----- # crm # cib new active ----- endif::[] There's no point making the services active on both locations if we can't reach them, so let's first clone the IP address. Cloned IPaddr2 resources use an iptables rule to ensure that each request only gets processed by one of the two clone instances. The additional meta options tell the cluster how many instances of the clone we want (one "request bucket" for each node) and that if all other nodes fail, then the remaining node should hold all of them. Otherwise the requests would be simply discarded. ifdef::pcs[] ---- # pcs -f active_cfg resource clone ClusterIP \ globally-unique=true clone-max=2 clone-node-max=2 ---- Notice that when the ClusterIP becomes a clone, the constraints referencing ClusterIP now reference the clone. This is done automatically by pcs. endif::[] ifdef::pcs[] [source,C] ---- # pcs -f active_cfg constraint Location Constraints: Ordering Constraints: start ClusterIP-clone then start WebSite WebFS then WebSite promote WebDataClone then start WebFS Colocation Constraints: WebSite with ClusterIP-clone WebFS with WebDataClone (with-rsc-role:Master) WebSite with WebFS ---- endif::[] -ifdef::crm[] +ifdef::crmsh[] [source,C] ----- # configure clone WebIP ClusterIP \ meta globally-unique="true" clone-max="2" clone-node-max="2" ----- endif::[] Now we must tell the ClusterIP how to decide which requests are processed by which hosts. To do this we must specify the clusterip_hash parameter. ifdef::pcs[] [source,C] ---- # pcs -f active_cfg resource update ClusterIP clusterip_hash=sourceip ---- endif::[] -ifdef::crm[] +ifdef::crmsh[] Open the ClusterIP resource: [source,C] ----- # configure edit ClusterIP ----- And add the following to the params line: ..... clusterip_hash="sourceip" ..... So that the complete definition looks like: ..... primitive ClusterIP ocf:heartbeat:IPaddr2 \ params ip="192.168.122.101" cidr_netmask="32" clusterip_hash="sourceip" \ op monitor interval="30s" .....
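[NOTE]
=======
Before committing, the pending changes in a shadow CIB can be sanity-checked without touching the live cluster; a minimal sketch, assuming a crmsh version that provides the +configure verify+ command:
....
crm(active) # configure verify
....
=======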
Here is the full transcript: [source,C] ----- # crm crm(live) # cib new active INFO: active shadow CIB created crm(active) # configure clone WebIP ClusterIP \ meta globally-unique="true" clone-max="2" clone-node-max="2" crm(active) # configure show node pcmk-1 node pcmk-2 primitive WebData ocf:linbit:drbd \ params drbd_resource="wwwdata" \ op monitor interval="60s" primitive WebFS ocf:heartbeat:Filesystem \ params device="/dev/drbd/by-res/wwwdata" directory="/var/www/html" fstype="gfs2" primitive WebSite ocf:heartbeat:apache \ params configfile="/etc/httpd/conf/httpd.conf" \ op monitor interval="1min" primitive ClusterIP ocf:heartbeat:IPaddr2 \ params ip="192.168.122.101" cidr_netmask="32" clusterip_hash="sourceip" \ op monitor interval="30s" ms WebDataClone WebData \ meta master-max="1" master-node-max="1" clone-max="2" clone-node-max="1" notify="true" clone WebIP ClusterIP \ meta globally-unique="true" clone-max="2" clone-node-max="2" colocation WebSite-with-WebFS inf: WebSite WebFS colocation fs_on_drbd inf: WebFS WebDataClone:Master colocation website-with-ip inf: WebSite WebIP order WebFS-after-WebData inf: WebDataClone:promote WebFS:start order WebSite-after-WebFS inf: WebFS WebSite order apache-after-ip inf: WebIP WebSite property $id="cib-bootstrap-options" \ dc-version="1.1.5-bdd89e69ba545404d02445be1f3d72e6a203ba2f" \ cluster-infrastructure="openais" \ expected-quorum-votes="2" \ stonith-enabled="false" \ no-quorum-policy="ignore" rsc_defaults $id="rsc-options" \ resource-stickiness="100" ----- Notice how any constraints that referenced ClusterIP have been updated to use WebIP instead. This is an additional benefit of using the crm shell. endif::[] Next we need to convert the filesystem and Apache resources into clones. ifdef::pcs[] Notice how pcs automatically updates the relevant constraints again. [source,C] ---- # pcs -f active_cfg resource clone WebFS # pcs -f active_cfg resource clone WebSite # pcs -f active_cfg constraint Location Constraints: Ordering Constraints: start ClusterIP-clone then start WebSite-clone WebFS-clone then WebSite-clone promote WebDataClone then start WebFS-clone Colocation Constraints: WebSite-clone with ClusterIP-clone WebFS-clone with WebDataClone (with-rsc-role:Master) WebSite-clone with WebFS-clone ---- endif::[] -ifdef::crm[] +ifdef::crmsh[] Again, the shell will automatically update any relevant constraints. [source,C] ----- crm(active) # configure clone WebFSClone WebFS crm(active) # configure clone WebSiteClone WebSite ----- endif::[] The last step is to tell the cluster that it is now allowed to promote both instances to be Primary (aka Master).
ifdef::pcs[] [source,C] ----- # pcs -f active_cfg resource update WebDataClone master-max=2 ----- endif::[] -ifdef::crm[] +ifdef::crmsh[] [source,C] ----- crm(active) # configure edit WebDataClone ----- Change master-max to 2. [source,C] ----- crm(active) # configure show node pcmk-1 node pcmk-2 primitive WebData ocf:linbit:drbd \ params drbd_resource="wwwdata" \ op monitor interval="60s" primitive WebFS ocf:heartbeat:Filesystem \ params device="/dev/drbd/by-res/wwwdata" directory="/var/www/html" fstype="gfs2" primitive WebSite ocf:heartbeat:apache \ params configfile="/etc/httpd/conf/httpd.conf" \ op monitor interval="1min" primitive ClusterIP ocf:heartbeat:IPaddr2 \ params ip="192.168.122.101" cidr_netmask="32" clusterip_hash="sourceip" \ op monitor interval="30s" ms WebDataClone WebData \ meta master-max="2" master-node-max="1" clone-max="2" clone-node-max="1" notify="true" clone WebFSClone WebFS clone WebIP ClusterIP \ meta globally-unique="true" clone-max="2" clone-node-max="2" clone WebSiteClone WebSite colocation WebSite-with-WebFS inf: WebSiteClone WebFSClone colocation fs_on_drbd inf: WebFSClone WebDataClone:Master colocation website-with-ip inf: WebSiteClone WebIP order WebFS-after-WebData inf: WebDataClone:promote WebFSClone:start order WebSite-after-WebFS inf: WebFSClone WebSiteClone order apache-after-ip inf: WebIP WebSiteClone property $id="cib-bootstrap-options" \ dc-version="1.1.5-bdd89e69ba545404d02445be1f3d72e6a203ba2f" \ cluster-infrastructure="openais" \ expected-quorum-votes="2" \ stonith-enabled="false" \ no-quorum-policy="ignore" rsc_defaults $id="rsc-options" \ resource-stickiness="100" ----- endif::[] Review the configuration before uploading it to the cluster, then quit the shell and watch the cluster's response. ifdef::pcs[] [source,C] ----- # pcs cluster push cib active_cfg # pcs resource start WebFS ----- After all the processes are started, the status should look similar to this: [source,C] ----- # pcs resource Master/Slave Set: WebDataClone [WebData] Masters: [ pcmk-2 pcmk-1 ] Clone Set: dlm-clone [dlm] Started: [ pcmk-2 pcmk-1 ] Clone Set: ClusterIP-clone [ClusterIP] (unique) ClusterIP:0 (ocf::heartbeat:IPaddr2) Started ClusterIP:1 (ocf::heartbeat:IPaddr2) Started Clone Set: WebFS-clone [WebFS] Started: [ pcmk-1 pcmk-2 ] Clone Set: WebSite-clone [WebSite] Started: [ pcmk-1 pcmk-2 ] ----- endif::[] -ifdef::crm[] +ifdef::crmsh[] [source,C] ----- crm(active) # cib commit active INFO: commited 'active' shadow CIB to the cluster crm(active) # quit bye # crm_mon ============ Last updated: Thu Sep 3 21:37:27 2009 Stack: openais Current DC: pcmk-2 - partition with quorum Version: 1.1.5-bdd89e69ba545404d02445be1f3d72e6a203ba2f 2 Nodes configured, 2 expected votes 6 Resources configured.
============ Online: [ pcmk-1 pcmk-2 ] Master/Slave Set: WebDataClone Masters: [ pcmk-1 pcmk-2 ] Clone Set: WebIP Started: [ pcmk-1 pcmk-2 ] Clone Set: WebFSClone Started: [ pcmk-1 pcmk-2 ] Clone Set: WebSiteClone Started: [ pcmk-1 pcmk-2 ] Clone Set: dlm_clone Started: [ pcmk-1 pcmk-2 ] ----- endif::[] === Testing Recovery === [NOTE] ======= TODO: Put one node into standby to demonstrate failover ======= diff --git a/doc/Clusters_from_Scratch/en-US/Ch-Active-Passive.txt b/doc/Clusters_from_Scratch/en-US/Ch-Active-Passive.txt index 44dd9f766d..7da8fca2d9 100644 --- a/doc/Clusters_from_Scratch/en-US/Ch-Active-Passive.txt +++ b/doc/Clusters_from_Scratch/en-US/Ch-Active-Passive.txt @@ -1,683 +1,683 @@ = Creating an Active/Passive Cluster = == Exploring the Existing Configuration == When Pacemaker starts up, it automatically records the number and details of the nodes in the cluster as well as which stack is being used and the version of Pacemaker being used. This is what the base configuration should look like. ifdef::pcs[] [source,C] ---- # pcs status Last updated: Fri Sep 14 10:12:01 2012 Last change: Fri Sep 14 09:51:55 2012 via crmd on pcmk-2 Stack: corosync Current DC: pcmk-1 (1) - partition with quorum Version: 1.1.8-1.el7-60a19ed12fdb4d5c6a6b6767f52e5391e447fec0 2 Nodes configured, unknown expected votes 0 Resources configured. Online: [ pcmk-1 pcmk-2 ] Full list of resources: ---- endif::[] -ifdef::crm[] +ifdef::crmsh[] [source,C] ---- # crm configure show node $id="1702537408" pcmk-1 node $id="1719314624" pcmk-2 property $id="cib-bootstrap-options" \ dc-version="1.1.7-2.fc17-ee0730e13d124c3d58f00016c3376a1de5323cff" \ cluster-infrastructure="corosync" ---- endif::[] ifdef::pcs[] For those that are not afraid of XML, you can see the raw cluster configuration and status by using the +pcs cluster cib+ command. .The last XML you'll see in this document ====== [source,C] ---- # pcs cluster cib ---- [source,XML] ---- ---- ====== endif::[] -ifdef::crm[] +ifdef::crmsh[] For those that are not afraid of XML, you can see the raw configuration by appending "xml" to the previous command. .The last XML you'll see in this document ====== [source,C] ---- # crm configure show xml ---- [source,XML] ---- ---- ====== endif::[] Before we make any changes, it's a good idea to check the validity of the configuration. [source,C] ---- # crm_verify -L -V error: unpack_resources: Resource start-up disabled since no STONITH resources have been defined error: unpack_resources: Either configure some or disable STONITH with the stonith-enabled option error: unpack_resources: NOTE: Clusters with shared data need STONITH to ensure data integrity Errors found during check: config not valid -V may provide more details ---- As you can see, the tool has found some errors. In order to guarantee the safety of your data footnote:[If the data is corrupt, there is little point in continuing to make it available], the default for STONITH footnote:[A common node fencing mechanism. Used to ensure data integrity by powering off "bad" nodes] in Pacemaker is +enabled+. However, it also knows when no STONITH configuration has been supplied and reports this as a problem (since the cluster would not be able to make progress if a situation requiring node fencing arose). For now, we will disable this feature and configure it later in the Configuring STONITH section. It is important to note that the use of STONITH is highly encouraged; turning it off tells the cluster to simply pretend that failed nodes are safely powered off.
Some vendors will even refuse to support clusters that have it disabled. To disable STONITH, we set the _stonith-enabled_ cluster option to false. ifdef::pcs[] [source,C] ---- # pcs property set stonith-enabled=false # crm_verify -L ---- endif::[] -ifdef::crm[] +ifdef::crmsh[] [source,C] ---- # crm configure property stonith-enabled=false # crm_verify -L ---- endif::[] With the new cluster option set, the configuration is now valid. [WARNING] ========= The use of stonith-enabled=false is completely inappropriate for a production cluster. We use it here to defer the discussion of its configuration which can differ widely from one installation to the next. See <<_what_is_stonith>> for information on why STONITH is important and details on how to configure it. ========= == Adding a Resource == The first thing we should do is configure an IP address. Regardless of where the cluster service(s) are running, we need a consistent address to contact them on. Here I will choose and add 192.168.122.120 as the floating address, give it the imaginative name ClusterIP and tell the cluster to check that it's running every 30 seconds. [IMPORTANT] =========== The chosen address must not be one already associated with a physical node. =========== //// No syntax highlighting here to avoid line munging with source,C //// ifdef::pcs[] ---- # pcs resource create ClusterIP ocf:heartbeat:IPaddr2 \ ip=192.168.0.120 cidr_netmask=32 op monitor interval=30s ---- endif::[] -ifdef::crm[] +ifdef::crmsh[] ---- # crm configure primitive ClusterIP ocf:heartbeat:IPaddr2 \ params ip=192.168.122.120 cidr_netmask=32 \ op monitor interval=30s ---- endif::[] The other important piece of information here is ocf:heartbeat:IPaddr2. This tells Pacemaker three things about the resource you want to add. The first field, ocf, is the standard to which the resource script conforms and where to find it. The second field is specific to OCF resources and tells the cluster which namespace to find the resource script in, in this case heartbeat. The last field indicates the name of the resource script. ifdef::pcs[] To obtain a list of the available resource standards (the ocf part of ocf:heartbeat:IPaddr2), run [source,C] ---- # pcs resource standards ocf lsb service systemd stonith ---- To obtain a list of the available ocf resource providers (the heartbeat part of ocf:heartbeat:IPaddr2), run [source,C] ---- # pcs resource providers heartbeat linbit pacemaker redhat ---- Finally, if you want to see all the resource agents available for a specific ocf provider (the IPaddr2 part of ocf:heartbeat:IPaddr2), run [source,C] ---- # pcs resource agents ocf:heartbeat AoEtarget AudibleAlarm CTDB ClusterMon Delay Dummy . . (skipping lots of resources to save space) . IPaddr2 . . .
symlink syslog-ng tomcat vmware ---- endif::[] -ifdef::crm[] +ifdef::crmsh[] To obtain a list of the available resource classes, run [source,C] ---- # crm ra classes heartbeat lsb ocf / heartbeat pacemaker stonith ---- To then find all the OCF resource agents provided by Pacemaker and Heartbeat, run [source,C] ---- # crm ra list ocf pacemaker ClusterMon Dummy HealthCPU HealthSMART Stateful SysInfo SystemHealth controld o2cb ping pingd # crm ra list ocf heartbeat AoEtarget AudibleAlarm CTDB ClusterMon Delay Dummy EvmsSCC Evmsd Filesystem ICP IPaddr IPaddr2 IPsrcaddr IPv6addr LVM LinuxSCSI MailTo ManageRAID ManageVE Pure-FTPd Raid1 Route SAPDatabase SAPInstance SendArp ServeRAID SphinxSearchDaemon Squid Stateful SysInfo VIPArip VirtualDomain WAS WAS6 WinPopup Xen Xinetd anything apache conntrackd db2 drbd eDir88 ethmonitor exportfs fio iSCSILogicalUnit iSCSITarget ids iscsi jboss ldirectord lxc mysql mysql-proxy nfsserver nginx oracle oralsnr pgsql pingd portblock postfix proftpd rsyncd scsi2reservation sfex symlink syslog-ng tomcat vmware ---- endif::[] Now verify that the IP resource has been added and display the cluster's status to see that it is now active. ifdef::pcs[] [source,C] ---- # pcs status Last updated: Fri Sep 14 10:17:00 2012 Last change: Fri Sep 14 10:15:48 2012 via cibadmin on pcmk-1 Stack: corosync Current DC: pcmk-1 (1) - partition with quorum Version: 1.1.8-1.el7-60a19ed12fdb4d5c6a6b6767f52e5391e447fec0 2 Nodes configured, unknown expected votes 1 Resources configured. Online: [ pcmk-1 pcmk-2 ] Full list of resources: ClusterIP (ocf::heartbeat:IPaddr2): Started pcmk-1 ---- endif::[] -ifdef::crm[] +ifdef::crmsh[] [source,C] ---- # crm configure show node $id="1702537408" pcmk-1 node $id="1719314624" pcmk-2 primitive ClusterIP ocf:heartbeat:IPaddr2 \ params ip="192.168.122.120" cidr_netmask="32" \ op monitor interval="30s" property $id="cib-bootstrap-options" \ dc-version="1.1.7-2.fc17-ee0730e13d124c3d58f00016c3376a1de5323cff" \ cluster-infrastructure="corosync" \ stonith-enabled="false" # crm_mon -1 ============ Last updated: Tue Apr 3 09:56:50 2012 Last change: Tue Apr 3 09:54:37 2012 via cibadmin on pcmk-1 Stack: corosync Current DC: pcmk-1 (1702537408) - partition with quorum Version: 1.1.7-2.fc17-ee0730e13d124c3d58f00016c3376a1de5323cff 2 Nodes configured, unknown expected votes 1 Resources configured. ============ Online: [ pcmk-1 pcmk-2 ] ClusterIP (ocf::heartbeat:IPaddr2): Started pcmk-1 ---- endif::[] == Perform a Failover == Being a high-availability cluster, we should test failover of our new resource before moving on. First, find the node on which the IP address is running. ifdef::pcs[] [source,C] ---- # pcs status Last updated: Fri Sep 14 10:17:00 2012 Last change: Fri Sep 14 10:15:48 2012 via cibadmin on pcmk-1 Stack: corosync Current DC: pcmk-1 (1) - partition with quorum Version: 1.1.8-1.el7-60a19ed12fdb4d5c6a6b6767f52e5391e447fec0 2 Nodes configured, unknown expected votes 1 Resources configured. Online: [ pcmk-1 pcmk-2 ] Full list of resources: ClusterIP (ocf::heartbeat:IPaddr2): Started pcmk-1 ---- endif::[] -ifdef::crm[] +ifdef::crmsh[] [source,C] ---- # crm resource status ClusterIP resource ClusterIP is running on: pcmk-1 ---- endif::[] Shut down Pacemaker and Corosync on that machine. ifdef::pcs[] [source,C] ---- # pcs cluster stop pcmk-1 Stopping Cluster... ---- Once Corosync is no longer running, go to the other node and check the cluster status.
[source,C] ---- # pcs status Last updated: Fri Sep 14 10:31:01 2012 Last change: Fri Sep 14 10:15:48 2012 via cibadmin on pcmk-1 Stack: corosync Current DC: pcmk-2 (2) - partition WITHOUT quorum Version: 1.1.8-1.el7-60a19ed12fdb4d5c6a6b6767f52e5391e447fec0 2 Nodes configured, unknown expected votes 1 Resources configured. Online: [ pcmk-2 ] OFFLINE: [ pcmk-1 ] Full list of resources: ClusterIP (ocf::heartbeat:IPaddr2): Stopped ---- endif::[] -ifdef::crm[] +ifdef::crmsh[] [source,C] ---- # ssh pcmk-1 -- service pacemaker stop # ssh pcmk-1 -- service corosync stop ---- Once Corosync is no longer running, go to the other node and check the cluster status with crm_mon. [source,C] ---- # crm_mon -1 ============ Last updated: Tue Apr 3 10:01:28 2012 Last change: Tue Apr 3 09:54:39 2012 via cibadmin on pcmk-1 Stack: corosync Current DC: pcmk-2 (1719314624) - partition WITHOUT quorum Version: 1.1.7-2.fc17-ee0730e13d124c3d58f00016c3376a1de5323cff 2 Nodes configured, unknown expected votes 1 Resources configured. ============ Online: [ pcmk-2 ] OFFLINE: [ pcmk-1 ] ---- endif::[] There are three things to notice about the cluster's current state. The first is that, as expected, +pcmk-1+ is now offline. However, we can also see that +ClusterIP+ isn't running anywhere! === Quorum and Two-Node Clusters === This is because the cluster no longer has quorum, as can be seen by the text "partition WITHOUT quorum" in the status output. In order to reduce the possibility of data corruption, Pacemaker's default behavior is to stop all resources if the cluster does not have quorum. A cluster is said to have quorum when more than half the known or expected nodes are online, or for the mathematically inclined, whenever the following equation is true: .... total_nodes < 2 * active_nodes .... In our case, for example, losing one of the two nodes leaves total_nodes at 2 but active_nodes at 1; since 2 < 2 is false, quorum is lost. Therefore a two-node cluster only has quorum when both nodes are running, which is no longer the case for our cluster. This would normally make the creation of a two-node cluster pointless footnote:[Actually some would argue that two-node clusters are always pointless, but that is an argument for another time]; however, it is possible to control how Pacemaker behaves when quorum is lost. In particular, we can tell the cluster to simply ignore quorum altogether. ifdef::pcs[] [source,C] ---- # pcs property set no-quorum-policy=ignore # pcs property dc-version: 1.1.8-1.el7-60a19ed12fdb4d5c6a6b6767f52e5391e447fec0 cluster-infrastructure: corosync stonith-enabled: false no-quorum-policy: ignore ---- endif::[] -ifdef::crm[] +ifdef::crmsh[] [source,C] ---- # crm configure property no-quorum-policy=ignore # crm configure show node $id="1702537408" pcmk-1 node $id="1719314624" pcmk-2 primitive ClusterIP ocf:heartbeat:IPaddr2 \ params ip="192.168.122.120" cidr_netmask="32" \ op monitor interval="30s" property $id="cib-bootstrap-options" \ dc-version="1.1.7-2.fc17-ee0730e13d124c3d58f00016c3376a1de5323cff" \ cluster-infrastructure="corosync" \ stonith-enabled="false" \ no-quorum-policy="ignore" ---- endif::[] After a few moments, the cluster will start the IP address on the remaining node. Note that the cluster still does not have quorum. ifdef::pcs[] [source,C] ---- # pcs status Last updated: Fri Sep 14 10:38:11 2012 Last change: Fri Sep 14 10:37:53 2012 via cibadmin on pcmk-2 Stack: corosync Current DC: pcmk-2 (2) - partition WITHOUT quorum Version: 1.1.8-1.el7-60a19ed12fdb4d5c6a6b6767f52e5391e447fec0 2 Nodes configured, unknown expected votes 1 Resources configured.
Online: [ pcmk-2 ] OFFLINE: [ pcmk-1 ] Full list of resources: ClusterIP (ocf::heartbeat:IPaddr2): Started pcmk-2 ---- endif::[] -ifdef::crm[] +ifdef::crmsh[] [source,C] ---- # crm_mon -1 ============ Last updated: Tue Apr 3 10:02:46 2012 Last change: Tue Apr 3 10:02:08 2012 via cibadmin on pcmk-2 Stack: corosync Current DC: pcmk-2 (1719314624) - partition WITHOUT quorum Version: 1.1.7-2.fc17-ee0730e13d124c3d58f00016c3376a1de5323cff 2 Nodes configured, unknown expected votes 1 Resources configured. ============ Online: [ pcmk-2 ] OFFLINE: [ pcmk-1 ] ClusterIP (ocf::heartbeat:IPaddr2): Started pcmk-2 ---- endif::[] Now simulate node recovery by restarting the cluster stack on +pcmk-1+ and check the cluster's status. Note: if you get an authentication error with the 'pcs cluster start pcmk-1' command, you must authenticate on the node using the 'pcs cluster auth pcmk-1 pcmk-2' command discussed earlier. ifdef::pcs[] [source,C] ---- # pcs cluster start pcmk-1 Starting Cluster... # pcs status Last updated: Fri Sep 14 10:42:56 2012 Last change: Fri Sep 14 10:37:53 2012 via cibadmin on pcmk-2 Stack: corosync Current DC: pcmk-2 (2) - partition with quorum Version: 1.1.8-1.el7-60a19ed12fdb4d5c6a6b6767f52e5391e447fec0 2 Nodes configured, unknown expected votes 1 Resources configured. Online: [ pcmk-1 pcmk-2 ] Full list of resources: ClusterIP (ocf::heartbeat:IPaddr2): Started pcmk-2 ---- endif::[] -ifdef::crm[] +ifdef::crmsh[] [source,C] ---- # service corosync start Starting Corosync Cluster Engine (corosync): [ OK ] # service pacemaker start Starting Pacemaker Cluster Manager: [ OK ] # crm_mon ============ Last updated: Fri Aug 28 15:32:13 2009 Stack: openais Current DC: pcmk-2 - partition with quorum Version: 1.1.5-bdd89e69ba545404d02445be1f3d72e6a203ba2f 2 Nodes configured, 2 expected votes 1 Resources configured. ============ Online: [ pcmk-1 pcmk-2 ] ClusterIP (ocf::heartbeat:IPaddr): Started pcmk-2 ---- endif::[] [NOTE] ====== In the dark days, the cluster may have moved the IP back to its original location (+pcmk-1+). Usually this is no longer the case. ====== === Prevent Resources from Moving after Recovery === In most circumstances, it is highly desirable to prevent healthy resources from being moved around the cluster. Moving resources almost always requires a period of downtime. For complex services like Oracle databases, this period can be quite long. To address this, Pacemaker has the concept of resource stickiness which controls how much a service prefers to stay running where it is. You may like to think of it as the "cost" of any downtime. By default, Pacemaker assumes there is zero cost associated with moving resources and will do so to achieve "optimal" footnote:[It should be noted that Pacemaker's definition of optimal may not always agree with that of a human. The order in which Pacemaker processes lists of resources and nodes creates implicit preferences in situations where the administrator has not explicitly specified them] resource placement. We can specify a different stickiness for every resource, but it is often sufficient to change the default.
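Should a single resource warrant a different value, stickiness can also be set on that resource alone. A minimal sketch using the stack-agnostic +crm_resource+ tool (the value 200 here is purely illustrative):

[source,C]
----
# crm_resource --resource ClusterIP --meta --set-parameter resource-stickiness --parameter-value 200
----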
ifdef::pcs[] [source,C] ---- # pcs resource rsc defaults resource-stickiness=100 # pcs resource rsc defaults resource-stickiness: 100 ---- endif::[] -ifdef::crm[] +ifdef::crmsh[] [source,C] ---- # crm configure rsc_defaults resource-stickiness=100 # crm configure show node $id="1702537408" pcmk-1 node $id="1719314624" pcmk-2 primitive ClusterIP ocf:heartbeat:IPaddr2 \ params ip="192.168.122.120" cidr_netmask="32" \ op monitor interval="30s" property $id="cib-bootstrap-options" \ dc-version="1.1.7-2.fc17-ee0730e13d124c3d58f00016c3376a1de5323cff" \ cluster-infrastructure="corosync" \ stonith-enabled="false" \ no-quorum-policy="ignore" rsc_defaults $id="rsc-options" \ resource-stickiness="100" ---- endif::[] diff --git a/doc/Clusters_from_Scratch/en-US/Ch-Apache.txt b/doc/Clusters_from_Scratch/en-US/Ch-Apache.txt index 4087af2d94..89ca042e41 100644 --- a/doc/Clusters_from_Scratch/en-US/Ch-Apache.txt +++ b/doc/Clusters_from_Scratch/en-US/Ch-Apache.txt @@ -1,794 +1,794 @@ = Apache - Adding More Services = == Foreword == Now that we have a basic but functional active/passive two-node cluster, we're ready to add some real services. We're going to start with Apache because it's a feature of many clusters and relatively simple to configure. == Installation == Before continuing, we need to make sure Apache is installed on both hosts. We also need the wget tool in order for the cluster to be able to check the status of the Apache server. [source,C] # yum install -y httpd wget ..... Loaded plugins: langpacks, presto, refresh-packagekit fedora/metalink | 2.6 kB 00:00 updates/metalink | 3.2 kB 00:00 updates-testing/metalink | 41 kB 00:00 Resolving Dependencies --> Running transaction check ---> Package httpd.x86_64 0:2.2.22-3.fc17 will be installed --> Processing Dependency: httpd-tools = 2.2.22-3.fc17 for package: httpd-2.2.22-3.fc17.x86_64 --> Processing Dependency: apr-util-ldap for package: httpd-2.2.22-3.fc17.x86_64 --> Processing Dependency: libaprutil-1.so.0()(64bit) for package: httpd-2.2.22-3.fc17.x86_64 --> Processing Dependency: libapr-1.so.0()(64bit) for package: httpd-2.2.22-3.fc17.x86_64 --> Running transaction check ---> Package apr.x86_64 0:1.4.6-1.fc17 will be installed ---> Package apr-util.x86_64 0:1.4.1-2.fc17 will be installed ---> Package apr-util-ldap.x86_64 0:1.4.1-2.fc17 will be installed ---> Package httpd-tools.x86_64 0:2.2.22-3.fc17 will be installed --> Finished Dependency Resolution Dependencies Resolved ===================================================================================== Package Arch Version Repository Size ===================================================================================== Installing: httpd x86_64 2.2.22-3.fc17 updates-testing 823 k wget x86_64 1.13.4-2.fc17 fedora 495 k Installing for dependencies: apr x86_64 1.4.6-1.fc17 fedora 99 k apr-util x86_64 1.4.1-2.fc17 fedora 78 k apr-util-ldap x86_64 1.4.1-2.fc17 fedora 17 k httpd-tools x86_64 2.2.22-3.fc17 updates-testing 74 k Transaction Summary ===================================================================================== Install 1 Package (+4 Dependent packages) Total download size: 1.1 M Installed size: 3.5 M Downloading Packages: (1/6): apr-1.4.6-1.fc17.x86_64.rpm | 99 kB 00:00 (2/6): apr-util-1.4.1-2.fc17.x86_64.rpm | 78 kB 00:00 (3/6): apr-util-ldap-1.4.1-2.fc17.x86_64.rpm | 17 kB 00:00 (4/6): httpd-2.2.22-3.fc17.x86_64.rpm | 823 kB 00:01 (5/6): httpd-tools-2.2.22-3.fc17.x86_64.rpm | 74 kB 00:00 (6/6): wget-1.13.4-2.fc17.x86_64.rpm | 495 kB 00:01
------------------------------------------------------------------------------------- Total 238 kB/s | 1.1 MB 00:04 Running Transaction Check Running Transaction Test Transaction Test Succeeded Running Transaction Installing : apr-1.4.6-1.fc17.x86_64 1/6 Installing : apr-util-1.4.1-2.fc17.x86_64 2/6 Installing : apr-util-ldap-1.4.1-2.fc17.x86_64 3/6 Installing : httpd-tools-2.2.22-3.fc17.x86_64 4/6 Installing : httpd-2.2.22-3.fc17.x86_64 5/6 Installing : wget-1.13.4-2.fc17.x86_64 6/6 Verifying : apr-util-ldap-1.4.1-2.fc17.x86_64 1/6 Verifying : httpd-tools-2.2.22-3.fc17.x86_64 2/6 Verifying : apr-util-1.4.1-2.fc17.x86_64 3/6 Verifying : apr-1.4.6-1.fc17.x86_64 4/6 Verifying : httpd-2.2.22-3.fc17.x86_64 5/6 Verifying : wget-1.13.4-2.fc17.x86_64 6/6 Installed: httpd.x86_64 0:2.2.22-3.fc17 wget.x86_64 0:1.13.4-2.fc17 Dependency Installed: apr.x86_64 0:1.4.6-1.fc17 apr-util.x86_64 0:1.4.1-2.fc17 apr-util-ldap.x86_64 0:1.4.1-2.fc17 httpd-tools.x86_64 0:2.2.22-3.fc17 Complete! ..... == Preparation == First we need to create a page for Apache to serve up. On Fedora the default Apache docroot is /var/www/html, so we'll create an index file there. [source,C] ----- # cat <<-END >/var/www/html/index.html <html> <body>My Test Site - pcmk-1</body> </html> END ----- For the moment, we will simplify things by serving up only a static site and manually syncing the data between the two nodes. So run the command again on pcmk-2. [source,C] ----- [root@pcmk-2 ~]# cat <<-END >/var/www/html/index.html <html> <body>My Test Site - pcmk-2</body> </html> END ----- == Enable the Apache status URL == In order to monitor the health of your Apache instance, and recover it if it fails, the resource agent used by Pacemaker assumes the server-status URL is available. Look for the following in '/etc/httpd/conf/httpd.conf' and make sure it is not disabled or commented out: [source,Apache Configuration] ----- <Location /server-status> SetHandler server-status Order deny,allow Deny from all Allow from 127.0.0.1 </Location> ----- == Update the Configuration == At this point, Apache is ready to go; all that needs to be done is to add it to the cluster. Let's call the resource WebSite. We need to use an OCF script called apache in the heartbeat namespace footnote:[Compare the key used here ocf:heartbeat:apache with the one we used earlier for the IP address: ocf:heartbeat:IPaddr2]. The only required parameter is the path to the main Apache configuration file, and we'll tell the cluster to check once a minute that apache is still running. ifdef::pcs[] //// source,C doesn't deal well with \'s //// ----- pcs resource create WebSite ocf:heartbeat:apache \ configfile=/etc/httpd/conf/httpd.conf \ statusurl="http://localhost/server-status" op monitor interval=1min ----- By default, the operation timeout for all resources' start, stop, and monitor operations is 20 seconds. In many cases this timeout period is less than the advised timeout period. For the purposes of this tutorial, we will adjust the global operation timeout default to 240 seconds.
[source,C] ----- # pcs resource op defaults timeout=240s # pcs resource op defaults timeout: 240s ----- endif::[] -ifdef::crm[] +ifdef::crmsh[] [source,Bash] ----- # crm configure primitive WebSite ocf:heartbeat:apache \ params configfile=/etc/httpd/conf/httpd.conf \ statusurl="http://localhost/server-status" \ op monitor interval=1min WARNING: WebSite: default timeout 20s for start is smaller than the advised 40s WARNING: WebSite: default timeout 20s for stop is smaller than the advised 60s ----- The easiest way to resolve this is to change the default: [source,Bash] ----- # crm configure op_defaults timeout=240s # crm configure show node $id="1702537408" pcmk-1 node $id="1719314624" pcmk-2 primitive ClusterIP ocf:heartbeat:IPaddr2 \ params ip="192.168.122.120" cidr_netmask="32" \ op monitor interval="30s" primitive WebSite ocf:heartbeat:apache \ params configfile="/etc/httpd/conf/httpd.conf" \ op monitor interval="1min" property $id="cib-bootstrap-options" \ dc-version="1.1.7-2.fc17-ee0730e13d124c3d58f00016c3376a1de5323cff" \ cluster-infrastructure="corosync" \ stonith-enabled="false" \ no-quorum-policy="ignore" rsc_defaults $id="rsc-options" \ resource-stickiness="100" op_defaults $id="op-options" \ timeout="240s" ----- endif::[] After a short delay, we should see the cluster start apache: ifdef::pcs[] [source,C] ----- # pcs status Last updated: Fri Sep 14 10:51:27 2012 Last change: Fri Sep 14 10:50:46 2012 via crm_attribute on pcmk-1 Stack: corosync Current DC: pcmk-2 (2) - partition with quorum Version: 1.1.8-1.el7-60a19ed12fdb4d5c6a6b6767f52e5391e447fec0 2 Nodes configured, unknown expected votes 2 Resources configured. Online: [ pcmk-1 pcmk-2 ] Full list of resources: ClusterIP (ocf::heartbeat:IPaddr2): Started pcmk-2 WebSite (ocf::heartbeat:apache): Started pcmk-1 ----- endif::[] -ifdef::crm[] +ifdef::crmsh[] [source,C] ----- # crm_mon -1 ============ Last updated: Tue Apr 3 11:54:29 2012 Last change: Tue Apr 3 11:54:26 2012 via crmd on pcmk-1 Stack: corosync Current DC: pcmk-1 (1702537408) - partition with quorum Version: 1.1.7-2.fc17-ee0730e13d124c3d58f00016c3376a1de5323cff 2 Nodes configured, unknown expected votes 2 Resources configured. ============ Online: [ pcmk-1 pcmk-2 ] ClusterIP (ocf:heartbeat:IPaddr2): Started pcmk-2 WebSite (ocf:heartbeat:apache): Started pcmk-1 ----- endif::[] Wait a moment: the WebSite resource isn't running on the same host as our IP address! ifdef::pcs[] [NOTE] ====== If, in the `pcs status` output, you see the WebSite resource has failed to start, then you've likely not enabled the status URL correctly. You can check if this is the problem by running: .... wget http://127.0.0.1/server-status .... If you see +Connection refused+ in the output, then this is indeed the problem. Check to ensure that +Allow from 127.0.0.1+ is present for the +<Location /server-status>+ block. ====== endif::[] -ifdef::crm[] +ifdef::crmsh[] [NOTE] ====== If, in the `crm_mon` output, you see: .... Failed actions: WebSite_start_0 (node=pcmk-2, call=301, rc=1, status=complete): unknown error .... Then you've likely not enabled the status URL correctly. You can check if this is the problem by running: .... wget http://127.0.0.1/server-status .... If you see +Connection refused+ in the output, then this is indeed the problem. Check to ensure that +Allow from 127.0.0.1+ is present for the +<Location /server-status>+ block. ====== endif::[] == Ensuring Resources Run on the Same Host == To reduce the load on any one machine, Pacemaker will generally try to spread the configured resources across the cluster nodes.
However, we can tell the cluster that two resources are related and need to run on the same host (or not at all). Here we instruct the cluster that WebSite can only run on the host that ClusterIP is active on. ifdef::pcs[] To achieve this we use a colocation constraint that indicates it is mandatory for WebSite to run on the same node as ClusterIP. The "mandatory" part of the colocation constraint is indicated by using a score of INFINITY. The INFINITY score also means that if ClusterIP is not active anywhere, WebSite will not be permitted to run. endif::[] -ifdef::crm[] +ifdef::crmsh[] For the constraint, we need a name (choose something descriptive like website-with-ip), indicate that it's mandatory (so that if ClusterIP is not active anywhere, WebSite will not be permitted to run anywhere either) by specifying a score of INFINITY, and finally list the two resources. endif::[] [NOTE] ======= If ClusterIP is not active anywhere, WebSite will not be permitted to run anywhere. ======= [IMPORTANT] =========== Colocation constraints are "directional", in that they imply certain things about the order in which the two resources will have a location chosen. In this case, we're saying +WebSite+ needs to be placed on the same machine as +ClusterIP+; this implies that we must know the location of +ClusterIP+ before choosing a location for +WebSite+. =========== ifdef::pcs[] [source,C] ----- # pcs constraint colocation add WebSite ClusterIP INFINITY # pcs constraint Location Constraints: Ordering Constraints: Colocation Constraints: WebSite with ClusterIP # pcs status Last updated: Fri Sep 14 11:00:44 2012 Last change: Fri Sep 14 11:00:25 2012 via cibadmin on pcmk-1 Stack: corosync Current DC: pcmk-2 (2) - partition with quorum Version: 1.1.8-1.el7-60a19ed12fdb4d5c6a6b6767f52e5391e447fec0 2 Nodes configured, unknown expected votes 2 Resources configured. Online: [ pcmk-1 pcmk-2 ] Full list of resources: ClusterIP (ocf::heartbeat:IPaddr2): Started pcmk-2 WebSite (ocf::heartbeat:apache): Started pcmk-2 ----- endif::[] -ifdef::crm[] +ifdef::crmsh[] [source,C] ----- # crm configure colocation website-with-ip INFINITY: WebSite ClusterIP # crm configure show node $id="1702537408" pcmk-1 node $id="1719314624" pcmk-2 primitive ClusterIP ocf:heartbeat:IPaddr2 \ params ip="192.168.122.120" cidr_netmask="32" \ op monitor interval="30s" primitive WebSite ocf:heartbeat:apache \ params configfile="/etc/httpd/conf/httpd.conf" \ op monitor interval="1min" colocation website-with-ip inf: WebSite ClusterIP property $id="cib-bootstrap-options" \ dc-version="1.1.7-2.fc17-ee0730e13d124c3d58f00016c3376a1de5323cff" \ cluster-infrastructure="corosync" \ stonith-enabled="false" \ no-quorum-policy="ignore" \ last-lrm-refresh="1333446866" rsc_defaults $id="rsc-options" \ resource-stickiness="100" op_defaults $id="op-options" \ timeout="240s" # crm_mon -1 ============ Last updated: Tue Apr 3 11:57:13 2012 Last change: Tue Apr 3 11:56:10 2012 via cibadmin on pcmk-1 Stack: corosync Current DC: pcmk-2 (1719314624) - partition with quorum Version: 1.1.7-2.fc17-ee0730e13d124c3d58f00016c3376a1de5323cff 2 Nodes configured, unknown expected votes 2 Resources configured. ============ Online: [ pcmk-1 pcmk-2 ] ClusterIP (ocf:heartbeat:IPaddr2): Started pcmk-2 WebSite (ocf:heartbeat:apache): Started pcmk-2 ----- endif::[] == Controlling Resource Start/Stop Ordering == When Apache starts, it binds to the available IP addresses.
It doesn't know about any addresses we add afterwards, so not only do they need to run on the same node, but we need to make sure ClusterIP is already active before we start WebSite. We do this by adding an ordering constraint. ifdef::pcs[] By default, all order constraints are mandatory unless otherwise configured. This means that the recovery of ClusterIP will also trigger the recovery of WebSite. [source,C] ----- # pcs constraint order ClusterIP then WebSite Adding ClusterIP WebSite (kind: Mandatory) (Options: first-action=start then-action=start) # pcs constraint Location Constraints: Ordering Constraints: start ClusterIP then start WebSite Colocation Constraints: WebSite with ClusterIP ----- endif::[] -ifdef::crm[] +ifdef::crmsh[] We need to give it a name (choose something descriptive like apache-after-ip), indicate that it's mandatory (so that any recovery for ClusterIP will also trigger recovery of WebSite) and list the two resources in the order we need them to start. [source,C] ----- # crm configure order apache-after-ip mandatory: ClusterIP WebSite # crm configure show node $id="1702537408" pcmk-1 node $id="1719314624" pcmk-2 primitive ClusterIP ocf:heartbeat:IPaddr2 \ params ip="192.168.122.120" cidr_netmask="32" \ op monitor interval="30s" primitive WebSite ocf:heartbeat:apache \ params configfile="/etc/httpd/conf/httpd.conf" \ op monitor interval="1min" colocation website-with-ip inf: WebSite ClusterIP order apache-after-ip inf: ClusterIP WebSite property $id="cib-bootstrap-options" \ dc-version="1.1.7-2.fc17-ee0730e13d124c3d58f00016c3376a1de5323cff" \ cluster-infrastructure="corosync" \ stonith-enabled="false" \ no-quorum-policy="ignore" \ last-lrm-refresh="1333446866" rsc_defaults $id="rsc-options" \ resource-stickiness="100" op_defaults $id="op-options" \ timeout="240s" ----- endif::[] == Specifying a Preferred Location == Pacemaker does not rely on any sort of hardware symmetry between nodes, so it may well be that one machine is more powerful than the other. In such cases, it makes sense to host the resources there if that node is available. To do this we create a location constraint. ifdef::pcs[] In the location constraint below, we are saying the WebSite resource prefers the node pcmk-1 with a score of 50. The score here indicates how badly we'd like the resource to run somewhere. [source,C] ----- # pcs constraint location WebSite prefers pcmk-1=50 # pcs constraint Location Constraints: Resource: WebSite Enabled on: pcmk-1 (score:50) Ordering Constraints: start ClusterIP then start WebSite Colocation Constraints: WebSite with ClusterIP # pcs status Last updated: Fri Sep 14 11:06:37 2012 Last change: Fri Sep 14 11:06:26 2012 via cibadmin on pcmk-1 Stack: corosync Current DC: pcmk-2 (2) - partition with quorum Version: 1.1.8-1.el7-60a19ed12fdb4d5c6a6b6767f52e5391e447fec0 2 Nodes configured, unknown expected votes 2 Resources configured. Online: [ pcmk-1 pcmk-2 ] Full list of resources: ClusterIP (ocf::heartbeat:IPaddr2): Started pcmk-2 WebSite (ocf::heartbeat:apache): Started pcmk-2 ----- endif::[] -ifdef::crm[] +ifdef::crmsh[] Again we give it a descriptive name (prefer-pcmk-1), specify the resource we want to run there (WebSite), how badly we'd like it to run there (we'll use 50 for now, but in a two-node situation almost any value above 0 will do) and the host's name. [source,C] ----- # crm configure location prefer-pcmk-1 WebSite 50: pcmk-1 WARNING: prefer-pcmk-1: referenced node pcmk-1 does not exist ----- This warning should be ignored.
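For the curious, the preference we just created is stored in the CIB as a +rsc_location+ element. A minimal sketch of the XML (the id simply matches the name we chose; you can inspect the real thing with `cibadmin -Q -o constraints`):

[source,XML]
-----
<rsc_location id="prefer-pcmk-1" rsc="WebSite" node="pcmk-1" score="50"/>
-----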
[source,C] ----- # crm configure show node $id="1702537408" pcmk-1 node $id="1719314624" pcmk-2 primitive ClusterIP ocf:heartbeat:IPaddr2 \ params ip="192.168.122.120" cidr_netmask="32" \ op monitor interval="30s" primitive WebSite ocf:heartbeat:apache \ params configfile="/etc/httpd/conf/httpd.conf" \ op monitor interval="1min" location prefer-pcmk-1 WebSite 50: pcmk-1 colocation website-with-ip inf: WebSite ClusterIP order apache-after-ip inf: ClusterIP WebSite property $id="cib-bootstrap-options" \ dc-version="1.1.7-2.fc17-ee0730e13d124c3d58f00016c3376a1de5323cff" \ cluster-infrastructure="corosync" \ stonith-enabled="false" \ no-quorum-policy="ignore" \ last-lrm-refresh="1333446866" rsc_defaults $id="rsc-options" \ resource-stickiness="100" op_defaults $id="op-options" \ timeout="240s" # crm_mon -1 ============ Last updated: Tue Apr 3 12:02:14 2012 Last change: Tue Apr 3 11:59:42 2012 via cibadmin on pcmk-1 Stack: corosync Current DC: pcmk-2 (1719314624) - partition with quorum Version: 1.1.7-2.fc17-ee0730e13d124c3d58f00016c3376a1de5323cff 2 Nodes configured, unknown expected votes 2 Resources configured. ============ Online: [ pcmk-1 pcmk-2 ] ClusterIP (ocf:heartbeat:IPaddr2): Started pcmk-2 WebSite (ocf:heartbeat:apache): Started pcmk-2 ----- endif::[] Wait a minute, the resources are still on pcmk-2! Even though we now prefer pcmk-1 over pcmk-2, that preference is (intentionally) less than the resource stickiness (how much we preferred not to have unnecessary downtime). To see the current placement scores, you can use a tool called crm_simulate: [source,C] ---- # crm_simulate -sL Current cluster status: Online: [ pcmk-1 pcmk-2 ] ClusterIP (ocf:heartbeat:IPaddr2): Started pcmk-2 WebSite (ocf:heartbeat:apache): Started pcmk-2 Allocation scores: native_color: ClusterIP allocation score on pcmk-1: 50 native_color: ClusterIP allocation score on pcmk-2: 200 native_color: WebSite allocation score on pcmk-1: -INFINITY native_color: WebSite allocation score on pcmk-2: 100 Transition Summary: ---- == Manually Moving Resources Around the Cluster == ifdef::pcs[] There are always times when an administrator needs to override the cluster and force resources to move to a specific location. By updating our previous location constraint with a score of INFINITY, WebSite will be forced to move to pcmk-1. [source,C] ----- # pcs constraint location WebSite prefers pcmk-1=INFINITY # pcs constraint all Location Constraints: Resource: WebSite Enabled on: pcmk-1 (score:INFINITY) (id:location-WebSite-pcmk-1-INFINITY) Ordering Constraints: start ClusterIP then start WebSite (Mandatory) (id:order-ClusterIP-WebSite-mandatory) Colocation Constraints: WebSite with ClusterIP (INFINITY) (id:colocation-WebSite-ClusterIP-INFINITY) # pcs status Last updated: Fri Sep 14 11:16:26 2012 Last change: Fri Sep 14 11:16:18 2012 via cibadmin on pcmk-1 Stack: corosync Current DC: pcmk-2 (2) - partition with quorum Version: 1.1.8-1.el7-60a19ed12fdb4d5c6a6b6767f52e5391e447fec0 2 Nodes configured, unknown expected votes 2 Resources configured. Online: [ pcmk-1 pcmk-2 ] Full list of resources: ClusterIP (ocf::heartbeat:IPaddr2): Started pcmk-1 WebSite (ocf::heartbeat:apache): Started pcmk-1 ----- endif::[] -ifdef::crm[] +ifdef::crmsh[] There are always times when an administrator needs to override the cluster and force resources to move to a specific location. Underneath, we use location constraints like the one we created above; happily, you don't need to care.
Just provide the name of the resource and the intended location, and we'll do the rest. [source,C] ----- # crm resource move WebSite pcmk-1 # crm_mon -1 ============ Last updated: Tue Apr 3 12:03:41 2012 Last change: Tue Apr 3 12:03:37 2012 via crm_resource on pcmk-1 Stack: corosync Current DC: pcmk-2 (1719314624) - partition with quorum Version: 1.1.7-2.fc17-ee0730e13d124c3d58f00016c3376a1de5323cff 2 Nodes configured, unknown expected votes 2 Resources configured. ============ Online: [ pcmk-1 pcmk-2 ] ClusterIP (ocf:heartbeat:IPaddr2): Started pcmk-1 WebSite (ocf:heartbeat:apache): Started pcmk-1 ----- Notice how the colocation rule we created has ensured that ClusterIP was also moved to pcmk-1. For the curious, we can see the effect of this command by examining the configuration: [source,C] ----- # crm configure show node $id="1702537408" pcmk-1 node $id="1719314624" pcmk-2 primitive ClusterIP ocf:heartbeat:IPaddr2 \ params ip="192.168.122.120" cidr_netmask="32" \ op monitor interval="30s" primitive WebSite ocf:heartbeat:apache \ params configfile="/etc/httpd/conf/httpd.conf" \ op monitor interval="1min" location cli-prefer-WebSite WebSite \ rule $id="cli-prefer-rule-WebSite" inf: #uname eq pcmk-1 location prefer-pcmk-1 WebSite 50: pcmk-1 colocation website-with-ip inf: WebSite ClusterIP order apache-after-ip inf: ClusterIP WebSite property $id="cib-bootstrap-options" \ dc-version="1.1.7-2.fc17-ee0730e13d124c3d58f00016c3376a1de5323cff" \ cluster-infrastructure="corosync" \ stonith-enabled="false" \ no-quorum-policy="ignore" \ last-lrm-refresh="1333446866" rsc_defaults $id="rsc-options" \ resource-stickiness="100" op_defaults $id="op-options" \ timeout="240s" ----- The automated constraint used to move the resources to +pcmk-1+ is the line beginning with +location cli-prefer-WebSite+. endif::[] === Giving Control Back to the Cluster === Once we've finished whatever activity required us to move the resources to pcmk-1 (in our case, nothing), we can allow the cluster to resume normal operation with the unmove command. Since we previously configured a default stickiness, the resources will remain on pcmk-1.
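Once the constraint is removed (as shown below), you can re-run the scoring tool we used earlier to confirm that the stickiness-derived scores are what keep the resources on pcmk-1; a quick sketch (the exact scores will depend on your configuration):

[source,C]
-----
# crm_simulate -sL   # the allocation scores should now favor pcmk-1
-----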
ifdef::pcs[] [source,C] ----- # pcs constraint all Location Constraints: Resource: WebSite Enabled on: pcmk-1 (score:INFINITY) (id:location-WebSite-pcmk-1-INFINITY) Ordering Constraints: start ClusterIP then start WebSite (Mandatory) (id:order-ClusterIP-WebSite-mandatory) Colocation Constraints: WebSite with ClusterIP (INFINITY) (id:colocation-WebSite-ClusterIP-INFINITY) # pcs constraint rm location-WebSite-pcmk-1-INFINITY # pcs constraint Location Constraints: Ordering Constraints: start ClusterIP then start WebSite Colocation Constraints: WebSite with ClusterIP ----- endif::[] -ifdef::crm[] +ifdef::crmsh[] [source,C] ----- # crm resource unmove WebSite # crm configure show node $id="1702537408" pcmk-1 node $id="1719314624" pcmk-2 primitive ClusterIP ocf:heartbeat:IPaddr2 \ params ip="192.168.122.120" cidr_netmask="32" \ op monitor interval="30s" primitive WebSite ocf:heartbeat:apache \ params configfile="/etc/httpd/conf/httpd.conf" \ op monitor interval="1min" location prefer-pcmk-1 WebSite 50: pcmk-1 colocation website-with-ip inf: WebSite ClusterIP order apache-after-ip inf: ClusterIP WebSite property $id="cib-bootstrap-options" \ dc-version="1.1.7-2.fc17-ee0730e13d124c3d58f00016c3376a1de5323cff" \ cluster-infrastructure="corosync" \ stonith-enabled="false" \ no-quorum-policy="ignore" \ last-lrm-refresh="1333446866" rsc_defaults $id="rsc-options" \ resource-stickiness="100" op_defaults $id="op-options" \ timeout="240s" ----- endif::[] Note that the constraint is now gone. If we check the cluster status, we can also see that, as expected, the resources are still active on pcmk-1. ifdef::pcs[] [source,C] ----- # pcs status Last updated: Fri Sep 14 11:57:12 2012 Last change: Fri Sep 14 11:57:03 2012 via cibadmin on pcmk-1 Stack: corosync Current DC: pcmk-2 (2) - partition with quorum Version: 1.1.8-1.el7-60a19ed12fdb4d5c6a6b6767f52e5391e447fec0 2 Nodes configured, unknown expected votes 2 Resources configured. Online: [ pcmk-1 pcmk-2 ] Full list of resources: ClusterIP (ocf::heartbeat:IPaddr2): Started pcmk-1 WebSite (ocf::heartbeat:apache): Started pcmk-1 ----- endif::[] -ifdef::crm[] +ifdef::crmsh[] [source,C] ----- # crm_mon ============ Last updated: Tue Apr 3 12:05:08 2012 Last change: Tue Apr 3 12:03:37 2012 via crm_resource on pcmk-1 Stack: corosync Current DC: pcmk-2 (1719314624) - partition with quorum Version: 1.1.7-2.fc17-ee0730e13d124c3d58f00016c3376a1de5323cff 2 Nodes configured, unknown expected votes 2 Resources configured. ============ Online: [ pcmk-1 pcmk-2 ] ClusterIP (ocf:heartbeat:IPaddr2): Started pcmk-1 WebSite (ocf:heartbeat:apache): Started pcmk-1 ----- endif::[] diff --git a/doc/Clusters_from_Scratch/en-US/Ch-Installation.txt b/doc/Clusters_from_Scratch/en-US/Ch-Installation.txt index 537af14694..a320921990 100644 --- a/doc/Clusters_from_Scratch/en-US/Ch-Installation.txt +++ b/doc/Clusters_from_Scratch/en-US/Ch-Installation.txt @@ -1,1020 +1,1089 @@ = Installation = == OS Installation == Detailed instructions for installing Fedora are available at http://docs.fedoraproject.org/en-US/Fedora/17/html/Installation_Guide/ in a number of languages. The abbreviated version is as follows... Point your browser to http://fedoraproject.org/en/get-fedora-all, locate the +Install Media+ section and download the install DVD that matches your hardware. Burn the disk image to a DVD footnote:[http://docs.fedoraproject.org/en-US/Fedora/16/html/Burning_ISO_images_to_disc/index.html] and boot from it, or use the image to boot a virtual machine.
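If you go the virtual machine route, one hypothetical way to boot the image under libvirt (the machine name, sizes, and ISO path here are illustrative and not part of the original guide):

[source,C]
----
# virt-install --name pcmk-1 --ram 1024 --vcpus 1 \
      --cdrom /var/lib/libvirt/images/Fedora-17-x86_64-DVD1.iso \
      --disk size=8 --network network=default
----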
After clicking through the welcome screen, select your language, keyboard layout footnote:[http://docs.fedoraproject.org/en-US/Fedora/16/html/Installation_Guide/sn-keyboard-x86.html] and storage type. footnote:[http://docs.fedoraproject.org/en-US/Fedora/16/html/Installation_Guide/Storage_Devices-x86.html] Assign your machine a host name. footnote:[http://docs.fedoraproject.org/en-US/Fedora/16/html/Installation_Guide/sn-Netconfig-x86.html] I happen to control the clusterlabs.org domain name, so I will use that here. [IMPORTANT] =========== Do not accept the default network settings. Cluster machines should never obtain an IP address via DHCP. When you are presented with the +Configure Network+ advanced option, select that option before continuing with the installation process to specify a fixed IPv4 address for +System eth0+. Be sure to also enter the +Routes+ section and add an entry for your default gateway. image::images/Network.png["Custom network settings",align="center"] If you miss this step, it can easily be configured after installation. You will have to navigate to +system settings+ and select +network+. From there, you can select which device to configure. =========== You will then be prompted to indicate the machine's physical location footnote:[http://docs.fedoraproject.org/en-US/Fedora/16/html/Installation_Guide/s1-timezone-x86.html] and to supply a root password. footnote:[http://docs.fedoraproject.org/en-US/Fedora/16/html/Installation_Guide/sn-account_configuration-x86.html] Now select where you want Fedora installed. footnote:[http://docs.fedoraproject.org/en-US/Fedora/16/html/Installation_Guide/s1-diskpartsetup-x86.html] As I don’t care about any existing data, I will accept the default and allow Fedora to use the complete drive. [IMPORTANT] =========== By default, Fedora uses LVM for partitioning, which allows us to dynamically change the amount of space allocated to a given partition. However, by default it also allocates all free space to the +/+ (aka. +root+) partition, which cannot be dynamically _reduced_ in size (dynamic increases are fine, by the way). So if you plan on following the DRBD or GFS2 portions of this guide, you should reserve at least 1GB of space on each machine from which to create a shared volume. To do so, select the +Review and modify partitioning layout+ checkbox before clicking +Next+. You will then be given an opportunity to reduce the size of the +root+ partition. =========== Next, choose which software should be installed. footnote:[http://docs.fedoraproject.org/en-US/Fedora/16/html/Installation_Guide/s1-pkgselection-x86.html] Change the selection to Minimal so that we see everything that gets installed. Don't enable updates yet; we'll do that (and install any extra software we need) later. After you click next, Fedora will begin installing. Go grab something to drink; this may take a while. Once the node reboots, you'll see a (possibly mangled) login prompt on the console. Log in using +root+ and the password you created earlier. image::images/Console.png["Initial Console",align="center"] [NOTE] ====== From here on in we're going to be working exclusively from the terminal.
====== == Post Installation Tasks == === Networking === Bring up the network and ensure it starts at boot: [source,C] ----- # service network start # chkconfig network on ----- Check that the machine has the static IP address you configured earlier: [source,C] ----- # ip addr 1: lo: <LOOPBACK,UP,LOWER_UP> mtu 16436 qdisc noqueue state UNKNOWN link/loopback 00:00:00:00:00:00 brd 00:00:00:00:00:00 inet 127.0.0.1/8 scope host lo inet6 ::1/128 scope host valid_lft forever preferred_lft forever 2: eth0: <BROADCAST,MULTICAST,UP,LOWER_UP> mtu 1500 qdisc pfifo_fast state UP qlen 1000 link/ether 52:54:00:d7:d6:08 brd ff:ff:ff:ff:ff:ff inet 192.168.122.101/24 brd 192.168.122.255 scope global eth0 inet6 fe80::5054:ff:fed7:d608/64 scope link valid_lft forever preferred_lft forever ----- Now check the default route setting: [source,C] ----- [root@pcmk-1 ~]# ip route default via 192.168.122.1 dev eth0 192.168.122.0/24 dev eth0 proto kernel scope link src 192.168.122.101 ----- If there is no line beginning with +default via+, then you may need to add a line such as [source,Bash] GATEWAY=192.168.122.1 to '/etc/sysconfig/network' and restart the network. Now check for connectivity to the outside world. Start small by testing whether we can reach the gateway we configured: [source,C] ----- # ping -c 1 192.168.122.1 PING 192.168.122.1 (192.168.122.1) 56(84) bytes of data. 64 bytes from 192.168.122.1: icmp_req=1 ttl=64 time=0.249 ms --- 192.168.122.1 ping statistics --- 1 packets transmitted, 1 received, 0% packet loss, time 0ms rtt min/avg/max/mdev = 0.249/0.249/0.249/0.000 ms ----- Now try something external; choose a location you know will be available: [source,C] ----- # ping -c 1 www.google.com PING www.l.google.com (173.194.72.106) 56(84) bytes of data. 64 bytes from tf-in-f106.1e100.net (173.194.72.106): icmp_req=1 ttl=41 time=167 ms --- www.l.google.com ping statistics --- 1 packets transmitted, 1 received, 0% packet loss, time 0ms rtt min/avg/max/mdev = 167.618/167.618/167.618/0.000 ms ----- === Leaving the Console === The console isn't a very friendly place to work from, so we will now switch to accessing the machine remotely via SSH, where we can use copy & paste, etc. First, we check that we can see the newly installed machine at all: [source,C] ----- beekhof@f16 ~ # ping -c 1 192.168.122.101 PING 192.168.122.101 (192.168.122.101) 56(84) bytes of data. 64 bytes from 192.168.122.101: icmp_req=1 ttl=64 time=1.01 ms --- 192.168.122.101 ping statistics --- 1 packets transmitted, 1 received, 0% packet loss, time 0ms rtt min/avg/max/mdev = 1.012/1.012/1.012/0.000 ms ----- Next, we log in via SSH: [source,C] ----- beekhof@f16 ~ # ssh -l root 192.168.122.11 root@192.168.122.11's password: Last login: Fri Mar 30 19:41:19 2012 from 192.168.122.1 [root@pcmk-1 ~]# ----- === Security Shortcuts === To simplify this guide and focus on the aspects directly connected to clustering, we will now disable the machine's firewall and SELinux. [WARNING] =========== Both of these actions create significant security issues and should not be performed on machines that will be exposed to the outside world. =========== [IMPORTANT] =========== TODO: Create an Appendix that deals with (at least) re-enabling the firewall.
=========== [source,C] ---- # setenforce 0 # sed -i.bak "s/SELINUX=enforcing/SELINUX=permissive/g" /etc/selinux/config # systemctl disable iptables.service # rm '/etc/systemd/system/basic.target.wants/iptables.service' # systemctl stop iptables.service ---- === Short Node Names === During installation, we filled in the machine's fully qualified domain name (FQDN), which can be rather long when it appears in cluster logs and status output. See for yourself how the machine identifies itself: (((Nodes, short name))) [source,C] ---- # uname -n pcmk-1.clusterlabs.org # dnsdomainname clusterlabs.org ---- (((Nodes, Domain name (Query)))) The output from the second command is fine, but we really don't need the domain name included in the basic host details. To address this, we need to update /etc/sysconfig/network. This is what it should look like before we start. [source,C] ---- # cat /etc/sysconfig/network NETWORKING=yes HOSTNAME=pcmk-1.clusterlabs.org GATEWAY=192.168.122.1 ---- All we need to do now is strip off the domain name portion, which is stored elsewhere anyway. [source,C] ---- # sed -i.sed 's/\.[a-z].*//g' /etc/sysconfig/network ---- Now confirm the change was successful. The revised file contents should look something like this. [source,C] ---- # cat /etc/sysconfig/network NETWORKING=yes HOSTNAME=pcmk-1 GATEWAY=192.168.122.1 ---- However, we're not finished. The machine won't normally see the shortened host name until it reboots, but we can force it to update. [source,C] ---- # source /etc/sysconfig/network # hostname $HOSTNAME ---- (((Nodes, Domain name (Remove from host name)))) Now check that the machine is using the correct names: [source,C] ---- # uname -n pcmk-1 # dnsdomainname clusterlabs.org ---- === NTP === It is highly recommended to enable NTP on your cluster nodes. Doing so ensures all nodes agree on the current time and makes reading log files significantly easier. footnote:[http://docs.fedoraproject.org/en-US/Fedora/17/html-single/System_Administrators_Guide/index.html#ch-Configuring_the_Date_and_Time] == Before You Continue == Repeat the Installation steps so far, so that you have two Fedora nodes ready to have the cluster software installed. For the purposes of this document, the additional node is called pcmk-2 with address 192.168.122.102. === Finalize Networking === Confirm that you can communicate between the two new nodes: [source,C] ---- # ping -c 3 192.168.122.102 PING 192.168.122.102 (192.168.122.102) 56(84) bytes of data. 64 bytes from 192.168.122.102: icmp_seq=1 ttl=64 time=0.343 ms 64 bytes from 192.168.122.102: icmp_seq=2 ttl=64 time=0.402 ms 64 bytes from 192.168.122.102: icmp_seq=3 ttl=64 time=0.558 ms --- 192.168.122.102 ping statistics --- 3 packets transmitted, 3 received, 0% packet loss, time 2000ms rtt min/avg/max/mdev = 0.343/0.434/0.558/0.092 ms ---- Now we need to make sure we can communicate with the machines by name. If you have a DNS server, add additional entries for the two machines. Otherwise, you'll need to add the machines to '/etc/hosts'. Below are the entries for my cluster nodes: [source,C] ---- # grep pcmk /etc/hosts 192.168.122.101 pcmk-1.clusterlabs.org pcmk-1 192.168.122.102 pcmk-2.clusterlabs.org pcmk-2 ---- We can now verify the setup by again using ping: [source,C] ---- # ping -c 3 pcmk-2 PING pcmk-2.clusterlabs.org (192.168.122.101) 56(84) bytes of data.
64 bytes from pcmk-1.clusterlabs.org (192.168.122.101): icmp_seq=1 ttl=64 time=0.164 ms 64 bytes from pcmk-1.clusterlabs.org (192.168.122.101): icmp_seq=2 ttl=64 time=0.475 ms 64 bytes from pcmk-1.clusterlabs.org (192.168.122.101): icmp_seq=3 ttl=64 time=0.186 ms --- pcmk-2.clusterlabs.org ping statistics --- 3 packets transmitted, 3 received, 0% packet loss, time 2001ms rtt min/avg/max/mdev = 0.164/0.275/0.475/0.141 ms ---- === Configure SSH === SSH is a convenient and secure way to copy files and perform commands remotely. For the purposes of this guide, we will create a key without a password (using the -N option) so that we can perform remote actions without being prompted. (((SSH))) [WARNING] ========= Unprotected SSH keys (those without a password) are not recommended for servers exposed to the outside world. We use them here only to simplify the demo. ========= Create a new key and allow anyone with that key to log in: .Creating and Activating a new SSH Key [source,C] ---- # ssh-keygen -t dsa -f ~/.ssh/id_dsa -N "" Generating public/private dsa key pair. Your identification has been saved in /root/.ssh/id_dsa. Your public key has been saved in /root/.ssh/id_dsa.pub. The key fingerprint is: 91:09:5c:82:5a:6a:50:08:4e:b2:0c:62:de:cc:74:44 root@pcmk-1.clusterlabs.org The key's randomart image is: +--[ DSA 1024]----+ |==.ooEo.. | |X O + .o o | | * A + | | + . | | . S | | | | | | | | | +-----------------+ # cp .ssh/id_dsa.pub .ssh/authorized_keys ---- (((Creating and Activating a new SSH Key))) Install the key on the other nodes and test that you can now run commands remotely, without being prompted: .Installing the SSH Key on Another Host [source,C] ---- # scp -r .ssh pcmk-2: The authenticity of host 'pcmk-2 (192.168.122.102)' can't be established. RSA key fingerprint is b1:2b:55:93:f1:d9:52:2b:0f:f2:8a:4e:ae:c6:7c:9a. Are you sure you want to continue connecting (yes/no)? yes Warning: Permanently added 'pcmk-2,192.168.122.102' (RSA) to the list of known hosts.root@pcmk-2's password: id_dsa.pub 100% 616 0.6KB/s 00:00 id_dsa 100% 672 0.7KB/s 00:00 known_hosts 100% 400 0.4KB/s 00:00 authorized_keys 100% 616 0.6KB/s 00:00 # ssh pcmk-2 -- uname -n pcmk-2 # ---- == Cluster Software Installation == === Install the Cluster Software === Since version 12, Fedora comes with recent versions of everything you need, so simply fire up the shell and run: [source,C] ---- # yum install -y pacemaker corosync ---- .....
fedora/metalink | 38 kB 00:00 fedora | 4.2 kB 00:00 fedora/primary_db | 14 MB 00:21 updates/metalink | 2.7 kB 00:00 updates | 2.6 kB 00:00 updates/primary_db | 1.2 kB 00:00 updates-testing/metalink | 28 kB 00:00 updates-testing | 4.5 kB 00:00 updates-testing/primary_db | 4.5 MB 00:12 Setting up Install Process Resolving Dependencies --> Running transaction check ---> Package corosync.x86_64 0:1.99.9-1.fc17 will be installed --> Processing Dependency: corosynclib = 1.99.9-1.fc17 for package: corosync-1.99.9-1.fc17.x86_64 --> Processing Dependency: libxslt for package: corosync-1.99.9-1.fc17.x86_64 --> Processing Dependency: libvotequorum.so.5(COROSYNC_VOTEQUORUM_1.0)(64bit) for package: corosync-1.99.9-1.fc17.x86_64 --> Processing Dependency: libquorum.so.5(COROSYNC_QUORUM_1.0)(64bit) for package: corosync-1.99.9-1.fc17.x86_64 --> Processing Dependency: libcpg.so.4(COROSYNC_CPG_1.0)(64bit) for package: corosync-1.99.9-1.fc17.x86_64 --> Processing Dependency: libcmap.so.4(COROSYNC_CMAP_1.0)(64bit) for package: corosync-1.99.9-1.fc17.x86_64 --> Processing Dependency: libcfg.so.6(COROSYNC_CFG_0.82)(64bit) for package: corosync-1.99.9-1.fc17.x86_64 --> Processing Dependency: libvotequorum.so.5()(64bit) for package: corosync-1.99.9-1.fc17.x86_64 --> Processing Dependency: libtotem_pg.so.5()(64bit) for package: corosync-1.99.9-1.fc17.x86_64 --> Processing Dependency: libquorum.so.5()(64bit) for package: corosync-1.99.9-1.fc17.x86_64 --> Processing Dependency: libqb.so.0()(64bit) for package: corosync-1.99.9-1.fc17.x86_64 --> Processing Dependency: libnetsnmp.so.30()(64bit) for package: corosync-1.99.9-1.fc17.x86_64 --> Processing Dependency: libcpg.so.4()(64bit) for package: corosync-1.99.9-1.fc17.x86_64 --> Processing Dependency: libcorosync_common.so.4()(64bit) for package: corosync-1.99.9-1.fc17.x86_64 --> Processing Dependency: libcmap.so.4()(64bit) for package: corosync-1.99.9-1.fc17.x86_64 --> Processing Dependency: libcfg.so.6()(64bit) for package: corosync-1.99.9-1.fc17.x86_64 ---> Package pacemaker.x86_64 0:1.1.7-2.fc17 will be installed --> Processing Dependency: pacemaker-libs = 1.1.7-2.fc17 for package: pacemaker-1.1.7-2.fc17.x86_64 --> Processing Dependency: pacemaker-cluster-libs = 1.1.7-2.fc17 for package: pacemaker-1.1.7-2.fc17.x86_64 --> Processing Dependency: pacemaker-cli = 1.1.7-2.fc17 for package: pacemaker-1.1.7-2.fc17.x86_64 --> Processing Dependency: resource-agents for package: pacemaker-1.1.7-2.fc17.x86_64 --> Processing Dependency: perl(Getopt::Long) for package: pacemaker-1.1.7-2.fc17.x86_64 --> Processing Dependency: libgnutls.so.26(GNUTLS_1_4)(64bit) for package: pacemaker-1.1.7-2.fc17.x86_64 --> Processing Dependency: cluster-glue for package: pacemaker-1.1.7-2.fc17.x86_64 --> Processing Dependency: /usr/bin/perl for package: pacemaker-1.1.7-2.fc17.x86_64 --> Processing Dependency: libtransitioner.so.1()(64bit) for package: pacemaker-1.1.7-2.fc17.x86_64 --> Processing Dependency: libstonithd.so.1()(64bit) for package: pacemaker-1.1.7-2.fc17.x86_64 --> Processing Dependency: libstonith.so.1()(64bit) for package: pacemaker-1.1.7-2.fc17.x86_64 --> Processing Dependency: libplumb.so.2()(64bit) for package: pacemaker-1.1.7-2.fc17.x86_64 --> Processing Dependency: libpils.so.2()(64bit) for package: pacemaker-1.1.7-2.fc17.x86_64 --> Processing Dependency: libpengine.so.3()(64bit) for package: pacemaker-1.1.7-2.fc17.x86_64 --> Processing Dependency: libpe_status.so.3()(64bit) for package: pacemaker-1.1.7-2.fc17.x86_64 --> Processing Dependency: libpe_rules.so.2()(64bit) for 
package: pacemaker-1.1.7-2.fc17.x86_64 --> Processing Dependency: libltdl.so.7()(64bit) for package: pacemaker-1.1.7-2.fc17.x86_64 --> Processing Dependency: liblrm.so.2()(64bit) for package: pacemaker-1.1.7-2.fc17.x86_64 --> Processing Dependency: libgnutls.so.26()(64bit) for package: pacemaker-1.1.7-2.fc17.x86_64 --> Processing Dependency: libcrmcommon.so.2()(64bit) for package: pacemaker-1.1.7-2.fc17.x86_64 --> Processing Dependency: libcrmcluster.so.1()(64bit) for package: pacemaker-1.1.7-2.fc17.x86_64 --> Processing Dependency: libcib.so.1()(64bit) for package: pacemaker-1.1.7-2.fc17.x86_64 --> Running transaction check ---> Package cluster-glue.x86_64 0:1.0.6-9.fc17.1 will be installed --> Processing Dependency: perl-TimeDate for package: cluster-glue-1.0.6-9.fc17.1.x86_64 --> Processing Dependency: libOpenIPMIutils.so.0()(64bit) for package: cluster-glue-1.0.6-9.fc17.1.x86_64 --> Processing Dependency: libOpenIPMIposix.so.0()(64bit) for package: cluster-glue-1.0.6-9.fc17.1.x86_64 --> Processing Dependency: libOpenIPMI.so.0()(64bit) for package: cluster-glue-1.0.6-9.fc17.1.x86_64 ---> Package cluster-glue-libs.x86_64 0:1.0.6-9.fc17.1 will be installed ---> Package corosynclib.x86_64 0:1.99.9-1.fc17 will be installed --> Processing Dependency: librdmacm.so.1(RDMACM_1.0)(64bit) for package: corosynclib-1.99.9-1.fc17.x86_64 --> Processing Dependency: libibverbs.so.1(IBVERBS_1.1)(64bit) for package: corosynclib-1.99.9-1.fc17.x86_64 --> Processing Dependency: libibverbs.so.1(IBVERBS_1.0)(64bit) for package: corosynclib-1.99.9-1.fc17.x86_64 --> Processing Dependency: librdmacm.so.1()(64bit) for package: corosynclib-1.99.9-1.fc17.x86_64 --> Processing Dependency: libibverbs.so.1()(64bit) for package: corosynclib-1.99.9-1.fc17.x86_64 ---> Package gnutls.x86_64 0:2.12.17-1.fc17 will be installed --> Processing Dependency: libtasn1.so.3(LIBTASN1_0_3)(64bit) for package: gnutls-2.12.17-1.fc17.x86_64 --> Processing Dependency: libtasn1.so.3()(64bit) for package: gnutls-2.12.17-1.fc17.x86_64 --> Processing Dependency: libp11-kit.so.0()(64bit) for package: gnutls-2.12.17-1.fc17.x86_64 ---> Package libqb.x86_64 0:0.11.1-1.fc17 will be installed ---> Package libtool-ltdl.x86_64 0:2.4.2-3.fc17 will be installed ---> Package libxslt.x86_64 0:1.1.26-9.fc17 will be installed ---> Package net-snmp-libs.x86_64 1:5.7.1-4.fc17 will be installed ---> Package pacemaker-cli.x86_64 0:1.1.7-2.fc17 will be installed ---> Package pacemaker-cluster-libs.x86_64 0:1.1.7-2.fc17 will be installed ---> Package pacemaker-libs.x86_64 0:1.1.7-2.fc17 will be installed ---> Package perl.x86_64 4:5.14.2-211.fc17 will be installed --> Processing Dependency: perl-libs = 4:5.14.2-211.fc17 for package: 4:perl-5.14.2-211.fc17.x86_64 --> Processing Dependency: perl(threads::shared) >= 1.21 for package: 4:perl-5.14.2-211.fc17.x86_64 --> Processing Dependency: perl(Socket) >= 1.3 for package: 4:perl-5.14.2-211.fc17.x86_64 --> Processing Dependency: perl(Scalar::Util) >= 1.10 for package: 4:perl-5.14.2-211.fc17.x86_64 --> Processing Dependency: perl(File::Spec) >= 0.8 for package: 4:perl-5.14.2-211.fc17.x86_64 --> Processing Dependency: perl-macros for package: 4:perl-5.14.2-211.fc17.x86_64 --> Processing Dependency: perl-libs for package: 4:perl-5.14.2-211.fc17.x86_64 --> Processing Dependency: perl(threads::shared) for package: 4:perl-5.14.2-211.fc17.x86_64 --> Processing Dependency: perl(threads) for package: 4:perl-5.14.2-211.fc17.x86_64 --> Processing Dependency: perl(Socket) for package: 4:perl-5.14.2-211.fc17.x86_64 --> 
Processing Dependency: perl(Scalar::Util) for package: 4:perl-5.14.2-211.fc17.x86_64 --> Processing Dependency: perl(Pod::Simple) for package: 4:perl-5.14.2-211.fc17.x86_64 --> Processing Dependency: perl(Module::Pluggable) for package: 4:perl-5.14.2-211.fc17.x86_64 --> Processing Dependency: perl(List::Util) for package: 4:perl-5.14.2-211.fc17.x86_64 --> Processing Dependency: perl(File::Spec::Unix) for package: 4:perl-5.14.2-211.fc17.x86_64 --> Processing Dependency: perl(File::Spec::Functions) for package: 4:perl-5.14.2-211.fc17.x86_64 --> Processing Dependency: perl(File::Spec) for package: 4:perl-5.14.2-211.fc17.x86_64 --> Processing Dependency: perl(Cwd) for package: 4:perl-5.14.2-211.fc17.x86_64 --> Processing Dependency: perl(Carp) for package: 4:perl-5.14.2-211.fc17.x86_64 --> Processing Dependency: libperl.so()(64bit) for package: 4:perl-5.14.2-211.fc17.x86_64 ---> Package resource-agents.x86_64 0:3.9.2-2.fc17.1 will be installed --> Processing Dependency: /usr/sbin/rpc.nfsd for package: resource-agents-3.9.2-2.fc17.1.x86_64 --> Processing Dependency: /usr/sbin/rpc.mountd for package: resource-agents-3.9.2-2.fc17.1.x86_64 --> Processing Dependency: /usr/sbin/ethtool for package: resource-agents-3.9.2-2.fc17.1.x86_64 --> Processing Dependency: /sbin/rpc.statd for package: resource-agents-3.9.2-2.fc17.1.x86_64 --> Processing Dependency: /sbin/quotaon for package: resource-agents-3.9.2-2.fc17.1.x86_64 --> Processing Dependency: /sbin/quotacheck for package: resource-agents-3.9.2-2.fc17.1.x86_64 --> Processing Dependency: /sbin/mount.nfs4 for package: resource-agents-3.9.2-2.fc17.1.x86_64 --> Processing Dependency: /sbin/mount.nfs for package: resource-agents-3.9.2-2.fc17.1.x86_64 --> Processing Dependency: /sbin/mount.cifs for package: resource-agents-3.9.2-2.fc17.1.x86_64 --> Processing Dependency: /sbin/fsck.xfs for package: resource-agents-3.9.2-2.fc17.1.x86_64 --> Processing Dependency: libnet.so.1()(64bit) for package: resource-agents-3.9.2-2.fc17.1.x86_64 --> Running transaction check ---> Package OpenIPMI-libs.x86_64 0:2.0.18-13.fc17 will be installed ---> Package cifs-utils.x86_64 0:5.3-2.fc17 will be installed --> Processing Dependency: libtalloc.so.2(TALLOC_2.0.2)(64bit) for package: cifs-utils-5.3-2.fc17.x86_64 --> Processing Dependency: keyutils for package: cifs-utils-5.3-2.fc17.x86_64 --> Processing Dependency: libwbclient.so.0()(64bit) for package: cifs-utils-5.3-2.fc17.x86_64 --> Processing Dependency: libtalloc.so.2()(64bit) for package: cifs-utils-5.3-2.fc17.x86_64 ---> Package ethtool.x86_64 2:3.2-2.fc17 will be installed ---> Package libibverbs.x86_64 0:1.1.6-2.fc17 will be installed ---> Package libnet.x86_64 0:1.1.5-3.fc17 will be installed ---> Package librdmacm.x86_64 0:1.0.15-1.fc17 will be installed ---> Package libtasn1.x86_64 0:2.12-1.fc17 will be installed ---> Package nfs-utils.x86_64 1:1.2.5-12.fc17 will be installed --> Processing Dependency: rpcbind for package: 1:nfs-utils-1.2.5-12.fc17.x86_64 --> Processing Dependency: libtirpc for package: 1:nfs-utils-1.2.5-12.fc17.x86_64 --> Processing Dependency: libnfsidmap for package: 1:nfs-utils-1.2.5-12.fc17.x86_64 --> Processing Dependency: libgssglue.so.1(libgssapi_CITI_2)(64bit) for package: 1:nfs-utils-1.2.5-12.fc17.x86_64 --> Processing Dependency: libgssglue for package: 1:nfs-utils-1.2.5-12.fc17.x86_64 --> Processing Dependency: libevent for package: 1:nfs-utils-1.2.5-12.fc17.x86_64 --> Processing Dependency: libtirpc.so.1()(64bit) for package: 1:nfs-utils-1.2.5-12.fc17.x86_64 --> Processing 
Dependency: libnfsidmap.so.0()(64bit) for package: 1:nfs-utils-1.2.5-12.fc17.x86_64 --> Processing Dependency: libgssglue.so.1()(64bit) for package: 1:nfs-utils-1.2.5-12.fc17.x86_64 --> Processing Dependency: libevent-2.0.so.5()(64bit) for package: 1:nfs-utils-1.2.5-12.fc17.x86_64 ---> Package p11-kit.x86_64 0:0.12-1.fc17 will be installed ---> Package perl-Carp.noarch 0:1.22-2.fc17 will be installed ---> Package perl-Module-Pluggable.noarch 1:3.90-211.fc17 will be installed ---> Package perl-PathTools.x86_64 0:3.33-211.fc17 will be installed ---> Package perl-Pod-Simple.noarch 1:3.16-211.fc17 will be installed --> Processing Dependency: perl(Pod::Escapes) >= 1.04 for package: 1:perl-Pod-Simple-3.16-211.fc17.noarch ---> Package perl-Scalar-List-Utils.x86_64 0:1.25-1.fc17 will be installed ---> Package perl-Socket.x86_64 0:2.001-1.fc17 will be installed ---> Package perl-TimeDate.noarch 1:1.20-6.fc17 will be installed ---> Package perl-libs.x86_64 4:5.14.2-211.fc17 will be installed ---> Package perl-macros.x86_64 4:5.14.2-211.fc17 will be installed ---> Package perl-threads.x86_64 0:1.86-2.fc17 will be installed ---> Package perl-threads-shared.x86_64 0:1.40-2.fc17 will be installed ---> Package quota.x86_64 1:4.00-3.fc17 will be installed --> Processing Dependency: quota-nls = 1:4.00-3.fc17 for package: 1:quota-4.00-3.fc17.x86_64 --> Processing Dependency: tcp_wrappers for package: 1:quota-4.00-3.fc17.x86_64 ---> Package xfsprogs.x86_64 0:3.1.8-1.fc17 will be installed --> Running transaction check ---> Package keyutils.x86_64 0:1.5.5-2.fc17 will be installed ---> Package libevent.x86_64 0:2.0.14-2.fc17 will be installed ---> Package libgssglue.x86_64 0:0.3-1.fc17 will be installed ---> Package libnfsidmap.x86_64 0:0.25-1.fc17 will be installed ---> Package libtalloc.x86_64 0:2.0.7-4.fc17 will be installed ---> Package libtirpc.x86_64 0:0.2.2-2.1.fc17 will be installed ---> Package libwbclient.x86_64 1:3.6.3-81.fc17.1 will be installed ---> Package perl-Pod-Escapes.noarch 1:1.04-211.fc17 will be installed ---> Package quota-nls.noarch 1:4.00-3.fc17 will be installed ---> Package rpcbind.x86_64 0:0.2.0-16.fc17 will be installed ---> Package tcp_wrappers.x86_64 0:7.6-69.fc17 will be installed --> Finished Dependency Resolution Dependencies Resolved ===================================================================================== Package Arch Version Repository Size ===================================================================================== Installing: corosync x86_64 1.99.9-1.fc17 updates-testing 159 k pacemaker x86_64 1.1.7-2.fc17 updates-testing 362 k Installing for dependencies: OpenIPMI-libs x86_64 2.0.18-13.fc17 fedora 466 k cifs-utils x86_64 5.3-2.fc17 updates-testing 66 k cluster-glue x86_64 1.0.6-9.fc17.1 fedora 229 k cluster-glue-libs x86_64 1.0.6-9.fc17.1 fedora 121 k corosynclib x86_64 1.99.9-1.fc17 updates-testing 96 k ethtool x86_64 2:3.2-2.fc17 fedora 94 k gnutls x86_64 2.12.17-1.fc17 fedora 385 k keyutils x86_64 1.5.5-2.fc17 fedora 49 k libevent x86_64 2.0.14-2.fc17 fedora 160 k libgssglue x86_64 0.3-1.fc17 fedora 24 k libibverbs x86_64 1.1.6-2.fc17 fedora 44 k libnet x86_64 1.1.5-3.fc17 fedora 54 k libnfsidmap x86_64 0.25-1.fc17 fedora 34 k libqb x86_64 0.11.1-1.fc17 updates-testing 68 k librdmacm x86_64 1.0.15-1.fc17 fedora 27 k libtalloc x86_64 2.0.7-4.fc17 fedora 22 k libtasn1 x86_64 2.12-1.fc17 updates-testing 319 k libtirpc x86_64 0.2.2-2.1.fc17 fedora 78 k libtool-ltdl x86_64 2.4.2-3.fc17 fedora 45 k libwbclient x86_64 1:3.6.3-81.fc17.1 updates-testing 68 
k libxslt x86_64 1.1.26-9.fc17 fedora 416 k net-snmp-libs x86_64 1:5.7.1-4.fc17 fedora 713 k nfs-utils x86_64 1:1.2.5-12.fc17 fedora 311 k p11-kit x86_64 0.12-1.fc17 updates-testing 36 k pacemaker-cli x86_64 1.1.7-2.fc17 updates-testing 368 k pacemaker-cluster-libs x86_64 1.1.7-2.fc17 updates-testing 77 k pacemaker-libs x86_64 1.1.7-2.fc17 updates-testing 322 k perl x86_64 4:5.14.2-211.fc17 fedora 10 M perl-Carp noarch 1.22-2.fc17 fedora 17 k perl-Module-Pluggable noarch 1:3.90-211.fc17 fedora 47 k perl-PathTools x86_64 3.33-211.fc17 fedora 105 k perl-Pod-Escapes noarch 1:1.04-211.fc17 fedora 40 k perl-Pod-Simple noarch 1:3.16-211.fc17 fedora 223 k perl-Scalar-List-Utils x86_64 1.25-1.fc17 updates-testing 33 k perl-Socket x86_64 2.001-1.fc17 updates-testing 44 k perl-TimeDate noarch 1:1.20-6.fc17 fedora 43 k perl-libs x86_64 4:5.14.2-211.fc17 fedora 628 k perl-macros x86_64 4:5.14.2-211.fc17 fedora 32 k perl-threads x86_64 1.86-2.fc17 fedora 47 k perl-threads-shared x86_64 1.40-2.fc17 fedora 36 k quota x86_64 1:4.00-3.fc17 fedora 160 k quota-nls noarch 1:4.00-3.fc17 fedora 74 k resource-agents x86_64 3.9.2-2.fc17.1 fedora 466 k rpcbind x86_64 0.2.0-16.fc17 fedora 52 k tcp_wrappers x86_64 7.6-69.fc17 fedora 72 k xfsprogs x86_64 3.1.8-1.fc17 updates-testing 715 k Transaction Summary ===================================================================================== Install 2 Packages (+46 Dependent packages) Total download size: 18 M Installed size: 59 M Downloading Packages: (1/48): OpenIPMI-libs-2.0.18-13.fc17.x86_64.rpm | 466 kB 00:00 warning: rpmts_HdrFromFdno: Header V3 RSA/SHA256 Signature, key ID 1aca3465: NOKEY Public key for OpenIPMI-libs-2.0.18-13.fc17.x86_64.rpm is not installed (2/48): cifs-utils-5.3-2.fc17.x86_64.rpm | 66 kB 00:01 Public key for cifs-utils-5.3-2.fc17.x86_64.rpm is not installed (3/48): cluster-glue-1.0.6-9.fc17.1.x86_64.rpm | 229 kB 00:00 (4/48): cluster-glue-libs-1.0.6-9.fc17.1.x86_64.rpm | 121 kB 00:00 (5/48): corosync-1.99.9-1.fc17.x86_64.rpm | 159 kB 00:01 (6/48): corosynclib-1.99.9-1.fc17.x86_64.rpm | 96 kB 00:00 (7/48): ethtool-3.2-2.fc17.x86_64.rpm | 94 kB 00:00 (8/48): gnutls-2.12.17-1.fc17.x86_64.rpm | 385 kB 00:00 (9/48): keyutils-1.5.5-2.fc17.x86_64.rpm | 49 kB 00:00 (10/48): libevent-2.0.14-2.fc17.x86_64.rpm | 160 kB 00:00 (11/48): libgssglue-0.3-1.fc17.x86_64.rpm | 24 kB 00:00 (12/48): libibverbs-1.1.6-2.fc17.x86_64.rpm | 44 kB 00:00 (13/48): libnet-1.1.5-3.fc17.x86_64.rpm | 54 kB 00:00 (14/48): libnfsidmap-0.25-1.fc17.x86_64.rpm | 34 kB 00:00 (15/48): libqb-0.11.1-1.fc17.x86_64.rpm | 68 kB 00:01 (16/48): librdmacm-1.0.15-1.fc17.x86_64.rpm | 27 kB 00:00 (17/48): libtalloc-2.0.7-4.fc17.x86_64.rpm | 22 kB 00:00 (18/48): libtasn1-2.12-1.fc17.x86_64.rpm | 319 kB 00:02 (19/48): libtirpc-0.2.2-2.1.fc17.x86_64.rpm | 78 kB 00:00 (20/48): libtool-ltdl-2.4.2-3.fc17.x86_64.rpm | 45 kB 00:00 (21/48): libwbclient-3.6.3-81.fc17.1.x86_64.rpm | 68 kB 00:00 (22/48): libxslt-1.1.26-9.fc17.x86_64.rpm | 416 kB 00:00 (23/48): net-snmp-libs-5.7.1-4.fc17.x86_64.rpm | 713 kB 00:01 (24/48): nfs-utils-1.2.5-12.fc17.x86_64.rpm | 311 kB 00:00 (25/48): p11-kit-0.12-1.fc17.x86_64.rpm | 36 kB 00:01 (26/48): pacemaker-1.1.7-2.fc17.x86_64.rpm | 362 kB 00:02 (27/48): pacemaker-cli-1.1.7-2.fc17.x86_64.rpm | 368 kB 00:02 (28/48): pacemaker-cluster-libs-1.1.7-2.fc17.x86_64.rpm | 77 kB 00:00 (29/48): pacemaker-libs-1.1.7-2.fc17.x86_64.rpm | 322 kB 00:01 (30/48): perl-5.14.2-211.fc17.x86_64.rpm | 10 MB 00:15 (31/48): perl-Carp-1.22-2.fc17.noarch.rpm | 17 kB 00:00 (32/48): 
perl-Module-Pluggable-3.90-211.fc17.noarch.rpm | 47 kB 00:00 (33/48): perl-PathTools-3.33-211.fc17.x86_64.rpm | 105 kB 00:00 (34/48): perl-Pod-Escapes-1.04-211.fc17.noarch.rpm | 40 kB 00:00 (35/48): perl-Pod-Simple-3.16-211.fc17.noarch.rpm | 223 kB 00:00 (36/48): perl-Scalar-List-Utils-1.25-1.fc17.x86_64.rpm | 33 kB 00:01 (37/48): perl-Socket-2.001-1.fc17.x86_64.rpm | 44 kB 00:00 (38/48): perl-TimeDate-1.20-6.fc17.noarch.rpm | 43 kB 00:00 (39/48): perl-libs-5.14.2-211.fc17.x86_64.rpm | 628 kB 00:00 (40/48): perl-macros-5.14.2-211.fc17.x86_64.rpm | 32 kB 00:00 (41/48): perl-threads-1.86-2.fc17.x86_64.rpm | 47 kB 00:00 (42/48): perl-threads-shared-1.40-2.fc17.x86_64.rpm | 36 kB 00:00 (43/48): quota-4.00-3.fc17.x86_64.rpm | 160 kB 00:00 (44/48): quota-nls-4.00-3.fc17.noarch.rpm | 74 kB 00:00 (45/48): resource-agents-3.9.2-2.fc17.1.x86_64.rpm | 466 kB 00:00 (46/48): rpcbind-0.2.0-16.fc17.x86_64.rpm | 52 kB 00:00 (47/48): tcp_wrappers-7.6-69.fc17.x86_64.rpm | 72 kB 00:00 (48/48): xfsprogs-3.1.8-1.fc17.x86_64.rpm | 715 kB 00:03 ---------------------------------------------------------------------------------------- Total 333 kB/s | 18 MB 00:55 Retrieving key from file:///etc/pki/rpm-gpg/RPM-GPG-KEY-fedora-x86_64 Importing GPG key 0x1ACA3465: Userid : "Fedora (17) " Fingerprint: cac4 3fb7 74a4 a673 d81c 5de7 50e9 4c99 1aca 3465 Package : fedora-release-17-0.8.noarch (@anaconda-0) From : /etc/pki/rpm-gpg/RPM-GPG-KEY-fedora-x86_64 Running Transaction Check Running Transaction Test Transaction Test Succeeded Running Transaction Installing : libqb-0.11.1-1.fc17.x86_64 1/48 Installing : libtool-ltdl-2.4.2-3.fc17.x86_64 2/48 Installing : cluster-glue-libs-1.0.6-9.fc17.1.x86_64 3/48 Installing : libxslt-1.1.26-9.fc17.x86_64 4/48 Installing : 1:perl-Pod-Escapes-1.04-211.fc17.noarch 5/48 Installing : perl-threads-1.86-2.fc17.x86_64 6/48 Installing : 4:perl-macros-5.14.2-211.fc17.x86_64 7/48 Installing : 1:perl-Pod-Simple-3.16-211.fc17.noarch 8/48 Installing : perl-Socket-2.001-1.fc17.x86_64 9/48 Installing : perl-Carp-1.22-2.fc17.noarch 10/48 Installing : 4:perl-libs-5.14.2-211.fc17.x86_64 11/48 Installing : perl-threads-shared-1.40-2.fc17.x86_64 12/48 Installing : perl-Scalar-List-Utils-1.25-1.fc17.x86_64 13/48 Installing : 1:perl-Module-Pluggable-3.90-211.fc17.noarch 14/48 Installing : perl-PathTools-3.33-211.fc17.x86_64 15/48 Installing : 4:perl-5.14.2-211.fc17.x86_64 16/48 Installing : libibverbs-1.1.6-2.fc17.x86_64 17/48 Installing : keyutils-1.5.5-2.fc17.x86_64 18/48 Installing : libgssglue-0.3-1.fc17.x86_64 19/48 Installing : libtirpc-0.2.2-2.1.fc17.x86_64 20/48 Installing : 1:net-snmp-libs-5.7.1-4.fc17.x86_64 21/48 Installing : rpcbind-0.2.0-16.fc17.x86_64 22/48 Installing : librdmacm-1.0.15-1.fc17.x86_64 23/48 Installing : corosynclib-1.99.9-1.fc17.x86_64 24/48 Installing : corosync-1.99.9-1.fc17.x86_64 25/48 error reading information on service corosync: No such file or directory Installing : 1:perl-TimeDate-1.20-6.fc17.noarch 26/48 Installing : 1:quota-nls-4.00-3.fc17.noarch 27/48 Installing : tcp_wrappers-7.6-69.fc17.x86_64 28/48 Installing : 1:quota-4.00-3.fc17.x86_64 29/48 Installing : libnfsidmap-0.25-1.fc17.x86_64 30/48 Installing : 1:libwbclient-3.6.3-81.fc17.1.x86_64 31/48 Installing : libnet-1.1.5-3.fc17.x86_64 32/48 Installing : 2:ethtool-3.2-2.fc17.x86_64 33/48 Installing : libevent-2.0.14-2.fc17.x86_64 34/48 Installing : 1:nfs-utils-1.2.5-12.fc17.x86_64 35/48 Installing : libtalloc-2.0.7-4.fc17.x86_64 36/48 Installing : cifs-utils-5.3-2.fc17.x86_64 37/48 Installing : 
libtasn1-2.12-1.fc17.x86_64 38/48 Installing : OpenIPMI-libs-2.0.18-13.fc17.x86_64 39/48 Installing : cluster-glue-1.0.6-9.fc17.1.x86_64 40/48 Installing : p11-kit-0.12-1.fc17.x86_64 41/48 Installing : gnutls-2.12.17-1.fc17.x86_64 42/48 Installing : pacemaker-libs-1.1.7-2.fc17.x86_64 43/48 Installing : pacemaker-cluster-libs-1.1.7-2.fc17.x86_64 44/48 Installing : pacemaker-cli-1.1.7-2.fc17.x86_64 45/48 Installing : xfsprogs-3.1.8-1.fc17.x86_64 46/48 Installing : resource-agents-3.9.2-2.fc17.1.x86_64 47/48 Installing : pacemaker-1.1.7-2.fc17.x86_64 48/48 Verifying : xfsprogs-3.1.8-1.fc17.x86_64 1/48 Verifying : 1:net-snmp-libs-5.7.1-4.fc17.x86_64 2/48 Verifying : corosync-1.99.9-1.fc17.x86_64 3/48 Verifying : cluster-glue-1.0.6-9.fc17.1.x86_64 4/48 Verifying : perl-PathTools-3.33-211.fc17.x86_64 5/48 Verifying : p11-kit-0.12-1.fc17.x86_64 6/48 Verifying : 1:perl-Pod-Simple-3.16-211.fc17.noarch 7/48 Verifying : OpenIPMI-libs-2.0.18-13.fc17.x86_64 8/48 Verifying : libtasn1-2.12-1.fc17.x86_64 9/48 Verifying : perl-threads-1.86-2.fc17.x86_64 10/48 Verifying : 1:perl-Pod-Escapes-1.04-211.fc17.noarch 11/48 Verifying : pacemaker-1.1.7-2.fc17.x86_64 12/48 Verifying : 4:perl-5.14.2-211.fc17.x86_64 13/48 Verifying : gnutls-2.12.17-1.fc17.x86_64 14/48 Verifying : perl-threads-shared-1.40-2.fc17.x86_64 15/48 Verifying : 4:perl-macros-5.14.2-211.fc17.x86_64 16/48 Verifying : 1:perl-Module-Pluggable-3.90-211.fc17.noarch 17/48 Verifying : 1:nfs-utils-1.2.5-12.fc17.x86_64 18/48 Verifying : cluster-glue-libs-1.0.6-9.fc17.1.x86_64 19/48 Verifying : pacemaker-libs-1.1.7-2.fc17.x86_64 20/48 Verifying : libtalloc-2.0.7-4.fc17.x86_64 21/48 Verifying : libevent-2.0.14-2.fc17.x86_64 22/48 Verifying : perl-Socket-2.001-1.fc17.x86_64 23/48 Verifying : libgssglue-0.3-1.fc17.x86_64 24/48 Verifying : perl-Carp-1.22-2.fc17.noarch 25/48 Verifying : libtirpc-0.2.2-2.1.fc17.x86_64 26/48 Verifying : 2:ethtool-3.2-2.fc17.x86_64 27/48 Verifying : 4:perl-libs-5.14.2-211.fc17.x86_64 28/48 Verifying : libxslt-1.1.26-9.fc17.x86_64 29/48 Verifying : rpcbind-0.2.0-16.fc17.x86_64 30/48 Verifying : librdmacm-1.0.15-1.fc17.x86_64 31/48 Verifying : resource-agents-3.9.2-2.fc17.1.x86_64 32/48 Verifying : 1:quota-4.00-3.fc17.x86_64 33/48 Verifying : 1:perl-TimeDate-1.20-6.fc17.noarch 34/48 Verifying : perl-Scalar-List-Utils-1.25-1.fc17.x86_64 35/48 Verifying : libtool-ltdl-2.4.2-3.fc17.x86_64 36/48 Verifying : pacemaker-cluster-libs-1.1.7-2.fc17.x86_64 37/48 Verifying : cifs-utils-5.3-2.fc17.x86_64 38/48 Verifying : libnet-1.1.5-3.fc17.x86_64 39/48 Verifying : corosynclib-1.99.9-1.fc17.x86_64 40/48 Verifying : libqb-0.11.1-1.fc17.x86_64 41/48 Verifying : 1:libwbclient-3.6.3-81.fc17.1.x86_64 42/48 Verifying : libnfsidmap-0.25-1.fc17.x86_64 43/48 Verifying : tcp_wrappers-7.6-69.fc17.x86_64 44/48 Verifying : keyutils-1.5.5-2.fc17.x86_64 45/48 Verifying : libibverbs-1.1.6-2.fc17.x86_64 46/48 Verifying : 1:quota-nls-4.00-3.fc17.noarch 47/48 Verifying : pacemaker-cli-1.1.7-2.fc17.x86_64 48/48 Installed: corosync.x86_64 0:1.99.9-1.fc17 pacemaker.x86_64 0:1.1.7-2.fc17 Dependency Installed: OpenIPMI-libs.x86_64 0:2.0.18-13.fc17 cifs-utils.x86_64 0:5.3-2.fc17 cluster-glue.x86_64 0:1.0.6-9.fc17.1 cluster-glue-libs.x86_64 0:1.0.6-9.fc17.1 corosynclib.x86_64 0:1.99.9-1.fc17 ethtool.x86_64 2:3.2-2.fc17 gnutls.x86_64 0:2.12.17-1.fc17 keyutils.x86_64 0:1.5.5-2.fc17 libevent.x86_64 0:2.0.14-2.fc17 libgssglue.x86_64 0:0.3-1.fc17 libibverbs.x86_64 0:1.1.6-2.fc17 libnet.x86_64 0:1.1.5-3.fc17 libnfsidmap.x86_64 0:0.25-1.fc17 libqb.x86_64 0:0.11.1-1.fc17 
librdmacm.x86_64 0:1.0.15-1.fc17 libtalloc.x86_64 0:2.0.7-4.fc17 libtasn1.x86_64 0:2.12-1.fc17 libtirpc.x86_64 0:0.2.2-2.1.fc17 libtool-ltdl.x86_64 0:2.4.2-3.fc17 libwbclient.x86_64 1:3.6.3-81.fc17.1 libxslt.x86_64 0:1.1.26-9.fc17 net-snmp-libs.x86_64 1:5.7.1-4.fc17 nfs-utils.x86_64 1:1.2.5-12.fc17 p11-kit.x86_64 0:0.12-1.fc17 pacemaker-cli.x86_64 0:1.1.7-2.fc17 pacemaker-cluster-libs.x86_64 0:1.1.7-2.fc17 pacemaker-libs.x86_64 0:1.1.7-2.fc17 perl.x86_64 4:5.14.2-211.fc17 perl-Carp.noarch 0:1.22-2.fc17 perl-Module-Pluggable.noarch 1:3.90-211.fc17 perl-PathTools.x86_64 0:3.33-211.fc17 perl-Pod-Escapes.noarch 1:1.04-211.fc17 perl-Pod-Simple.noarch 1:3.16-211.fc17 perl-Scalar-List-Utils.x86_64 0:1.25-1.fc17 perl-Socket.x86_64 0:2.001-1.fc17 perl-TimeDate.noarch 1:1.20-6.fc17 perl-libs.x86_64 4:5.14.2-211.fc17 perl-macros.x86_64 4:5.14.2-211.fc17 perl-threads.x86_64 0:1.86-2.fc17 perl-threads-shared.x86_64 0:1.40-2.fc17 quota.x86_64 1:4.00-3.fc17 quota-nls.noarch 1:4.00-3.fc17 resource-agents.x86_64 0:3.9.2-2.fc17.1 rpcbind.x86_64 0:0.2.0-16.fc17 tcp_wrappers.x86_64 0:7.6-69.fc17 xfsprogs.x86_64 0:3.1.8-1.fc17 Complete! [root@pcmk-1 ~]# ..... Now install the cluster software on the second node. ifdef::pcs[] === Install the Cluster Management Software === The pcs cli command coupled with the pcs daemon creates a cluster management system capable of managing all aspects of the cluster stack across all nodes from a single location. [source,C] ---- # yum install -y pcs ---- Make sure to install the pcs packages on both nodes. endif::[] == Setup == ifdef::pcs[] === Enable pcs Daemon === Before the cluster can be configured, the pcs daemon must be started and enabled to start at boot on each node. This daemon works with the pcs cli command to manage syncing the corosync configuration across all the nodes in the cluster. Start and enable the daemon by issuing the following commands on each node. [source,C] ---- # systemctl start pcsd.service # systemctl enable pcsd.service ---- Now set up a common pcs user account on each node in the cluster using the pcs_passwd command. In the example below, the user account 'pcmk' is created. You will be asked to supply a password (or supply one with the -p option). Make sure the username and password are consistent across all the nodes. [source,C] ---- # pcs_passwd pcmk password: ---- The pcs daemon account is required on each node to enable remote pcs command authentication. While the pcs cli command can be used locally without setting up a pcs daemon user account, access to pcs features that require access to remote nodes (such as syncing the corosync config, or starting/stopping the cluster on remote nodes) will be unavailable. This tutorial will make use of these remote access commands. endif::[] +ifdef::crmsh[] + +=== Preparation - Multicast === + +Choose a port number and +http://en.wikipedia.org/wiki/Multicast[multi-cast] address. +http://en.wikipedia.org/wiki/Multicast_address[] + +Be sure that the values you chose do not conflict with any existing +clusters you might have. For this document, I have chosen port '4000' +and used '239.255.1.1' as the multi-cast address. + +endif::[] + +=== Notes on Multicast Address Assignment === + +There are several subtle points that often deserve consideration when +choosing/assigning multicast addresses. +footnote:[This information is borrowed from the now-defunct http://web.archive.org/web/20101211210054/http://29west.com/docs/THPM/multicast-address-assignment.html] + +.
Avoid '224.0.0.x' ++ +Traffic to addresses of the form '224.0.0.x' is often flooded to all +switch ports. This address range is reserved for link-local uses. Many +routing protocols assume that all traffic within this range will be +received by all routers on the network. Hence (at least all Cisco) +switches flood traffic within this range. The flooding behavior +overrides the normal selective forwarding behavior of a +multicast-aware switch (e.g. IGMP snooping, CGMP, etc.). + +. Watch for '32:1' overlap ++ +32 non-contiguous IP multicast addresses are mapped onto each Ethernet +multicast address. A receiver that joins a single IP multicast group +implicitly joins 31 others due to this overlap. Of course, filtering +in the operating system discards undesired multicast traffic from +applications, but NIC bandwidth and CPU resources are nonetheless +consumed discarding it. The overlap occurs in the 5 high-order bits, +so it's best to use the 23 low-order bits to make distinct multicast +streams unique. For example, IP multicast addresses in the range +'239.0.0.0' to '239.127.255.255' all map to unique Ethernet multicast +addresses. However, IP multicast address '239.128.0.0' maps to the +same Ethernet multicast address as '239.0.0.0', '239.128.0.1' maps to +the same Ethernet multicast address as '239.0.0.1', etc. + +. Avoid 'x.0.0.y' and 'x.128.0.y' ++ +Combining the above two considerations, it's best to avoid using IP +multicast addresses of the form 'x.0.0.y' and 'x.128.0.y' since they +all map onto the range of Ethernet multicast addresses that are +flooded to all switch ports. + +. Watch for address assignment conflicts ++ +http://www.iana.org/[IANA] administers +http://www.iana.org/assignments/multicast-addresses[Internet multicast +addresses]. Potential conflicts with Internet multicast address +assignments can be avoided by using +http://www.ietf.org/rfc/rfc3180.txt[GLOP addressing] +(http://en.wikipedia.org/wiki/Autonomous_system_%28Internet%29[AS] +required) or http://www.ietf.org/rfc/rfc2365.txt[administratively +scoped] addresses. Such addresses can be safely used on a network +connected to the Internet without fear of conflict with multicast +sources originating on the Internet. Administratively scoped addresses +are roughly analogous to the unicast address space for +http://www.ietf.org/rfc/rfc1918.txt[private internets]. Site-local +multicast addresses are of the form '239.255.x.y', but can grow down +to '239.252.x.y' if needed. Organization-local multicast addresses are +of the form '239.192-251.x.y', but can grow down to '239.x.y.z' if +needed. + +For a more detailed treatment (57 pages!), see +http://www.cisco.com/en/US/tech/tk828/technologies_white_paper09186a00802d4643.shtml[Cisco's +Guidelines for Enterprise IP Multicast Address Allocation] paper. + === Configuring Corosync === ifdef::pcs[] In the past, at this point in the tutorial, an explanation of how to configure and propagate corosync's /etc/corosync/corosync.conf file would be necessary. Using pcs with the pcs daemon greatly simplifies this process by generating the corosync.conf across all the nodes in the cluster with a single command. The only thing required to achieve this is to authenticate as the pcs user 'pcmk' on one of the nodes in the cluster, and then issue the 'pcs cluster setup' command with a list of all the node names in the cluster.
[source,C] ---- # pcs cluster auth pcmk-1 pcmk-2 Username: pcmk Password: pcmk-1: Authorized pcmk-2: Authorized # pcs cluster setup mycluster pcmk-1 pcmk-2 pcmk-1: Succeeded pcmk-2: Succeeded ---- That's it. Corosync is configured across the cluster. If you received an authorization error for either of those commands, make sure you set up the 'pcmk' user account using the pcs_passwd command on every node in the cluster with the same password. endif::[] -ifdef::crm[] -Choose a port number and multi-cast footnote:[http://en.wikipedia.org/wiki/Multicast] address. footnote:[http://en.wikipedia.org/wiki/Multicast_address] -Be sure that the values you chose do not conflict with any existing clusters you might have. -For advice on choosing a multi-cast address, see -http://www.29west.com/docs/THPM/multicast-address-assignment.html -For this document, I have chosen port 4000 and used 239.255.1.1 as the multi-cast address. - +ifdef::crmsh[] [IMPORTANT] =========== The instructions below only apply for a machine with a single NIC. If you have a more complicated setup, you should edit the configuration manually. =========== [source,C] ---- # export ais_port=4000 # export ais_mcast=239.255.1.1 ---- Next, we automatically determine the host's address. By not using the full address, we make the configuration suitable to be copied to other nodes. [source,Bash] ---- -export ais_addr=`ip addr | grep "inet " | tail -n 1 | awk '{print $4}' | sed s/255/0/` +export ais_addr=`ip addr | grep "inet " | tail -n 1 | awk '{print $4}' | sed s/255/0/g` ---- Display and verify the configuration options: [source,Bash] ---- # env | grep ais_ ais_mcast=239.255.1.1 ais_port=4000 ais_addr=192.168.122.0 ---- Once you're happy with the chosen values, update the Corosync configuration: [source,C] ---- # cp /etc/corosync/corosync.conf.example /etc/corosync/corosync.conf # sed -i.bak "s/.*mcastaddr:.*/mcastaddr:\ $ais_mcast/g" /etc/corosync/corosync.conf # sed -i.bak "s/.*mcastport:.*/mcastport:\ $ais_port/g" /etc/corosync/corosync.conf # sed -i.bak "s/.*\tbindnetaddr:.*/bindnetaddr:\ $ais_addr/g" /etc/corosync/corosync.conf ---- Lastly, you'll need to enable quorum: [source,Bash] ----- cat << END >> /etc/corosync/corosync.conf quorum { provider: corosync_votequorum expected_votes: 2 } END ----- endif::[] The final /etc/corosync/corosync.conf configuration on each node should look something like the sample in Appendix B, Sample Corosync Configuration. [IMPORTANT] =========== Pacemaker used to obtain membership and quorum from a custom Corosync plugin. This plugin also had the capability to start Pacemaker automatically when Corosync was started. Neither behavior is possible with Corosync 2.0 and beyond as support for plugins was removed. Instead, Pacemaker must be started as a separate job/initscript. Also, since Pacemaker made use of the plugin for message routing, a node using the plugin (Corosync prior to 2.0) cannot talk to one that isn't (Corosync 2.0+). Rolling upgrades between these versions are therefore not possible and an alternate strategy footnote:[http://www.clusterlabs.org/doc/en-US/Pacemaker/1.1/html/Pacemaker_Explained/ap-upgrade.html] must be used.
=========== -ifdef::crm[] +ifdef::crmsh[] === Propagate the Configuration === Now we need to copy the changes so far to the other node: [source,C] ---- # for f in /etc/corosync/corosync.conf /etc/hosts; do scp $f pcmk-2:$f ; done corosync.conf 100% 1528 1.5KB/s 00:00 hosts 100% 281 0.3KB/s 00:00 # ---- endif::[] diff --git a/doc/Clusters_from_Scratch/en-US/Ch-Shared-Storage.txt b/doc/Clusters_from_Scratch/en-US/Ch-Shared-Storage.txt index 00a65008c1..b83cdb9a10 100644 --- a/doc/Clusters_from_Scratch/en-US/Ch-Shared-Storage.txt +++ b/doc/Clusters_from_Scratch/en-US/Ch-Shared-Storage.txt @@ -1,778 +1,778 @@ = Replicated Storage with DRBD = == Background == Even if you're serving up static websites, having to manually synchronize the contents of that website to all the machines in the cluster is not ideal. For dynamic websites, such as a wiki, it's not even an option. Not everyone can afford network-attached storage, but somehow the data needs to be kept in sync. Enter DRBD, which can be thought of as network-based RAID-1. See http://www.drbd.org/ for more details. == Install the DRBD Packages == Since its inclusion in the upstream 2.6.33 kernel, everything needed to use DRBD has shipped with Fedora since version 13. All you need to do is install it: [source,C] # yum install -y drbd-pacemaker drbd-udev ..... Loaded plugins: langpacks, presto, refresh-packagekit Resolving Dependencies --> Running transaction check ---> Package drbd-pacemaker.x86_64 0:8.3.11-5.fc17 will be installed --> Processing Dependency: drbd-utils = 8.3.11-5.fc17 for package: drbd-pacemaker-8.3.11-5.fc17.x86_64 ---> Package drbd-udev.x86_64 0:8.3.11-5.fc17 will be installed --> Running transaction check ---> Package drbd-utils.x86_64 0:8.3.11-5.fc17 will be installed --> Finished Dependency Resolution Dependencies Resolved ====================================================================================== Package Arch Version Repository Size ====================================================================================== Installing: drbd-pacemaker x86_64 8.3.11-5.fc17 updates-testing 22 k drbd-udev x86_64 8.3.11-5.fc17 updates-testing 6.4 k Installing for dependencies: drbd-utils x86_64 8.3.11-5.fc17 updates-testing 183 k Transaction Summary ====================================================================================== Install 2 Packages (+1 Dependent package) Total download size: 212 k Installed size: 473 k Downloading Packages: (1/3): drbd-pacemaker-8.3.11-5.fc17.x86_64.rpm | 22 kB 00:00 (2/3): drbd-udev-8.3.11-5.fc17.x86_64.rpm | 6.4 kB 00:00 (3/3): drbd-utils-8.3.11-5.fc17.x86_64.rpm | 183 kB 00:00 -------------------------------------------------------------------------------------- Total 293 kB/s | 212 kB 00:00 Running Transaction Check Running Transaction Test Transaction Test Succeeded Running Transaction Installing : drbd-utils-8.3.11-5.fc17.x86_64 1/3 Installing : drbd-pacemaker-8.3.11-5.fc17.x86_64 2/3 Installing : drbd-udev-8.3.11-5.fc17.x86_64 3/3 Verifying : drbd-pacemaker-8.3.11-5.fc17.x86_64 1/3 Verifying : drbd-udev-8.3.11-5.fc17.x86_64 2/3 Verifying : drbd-utils-8.3.11-5.fc17.x86_64 3/3 Installed: drbd-pacemaker.x86_64 0:8.3.11-5.fc17 drbd-udev.x86_64 0:8.3.11-5.fc17 Dependency Installed: drbd-utils.x86_64 0:8.3.11-5.fc17 Complete! ..... == Configure DRBD == Before we configure DRBD, we need to set aside some disk for it to use. === Create A Partition for DRBD === If you have more than 1GB free, feel free to use it.
For this guide however, 1GB is plenty of space for a single html file and sufficient for later holding the GFS2 metadata. [source,C] ---- # vgdisplay | grep -e Name -e Free VG Name vg_pcmk1 Free PE / Size 31 / 992.00 MiB # lvs LV VG Attr LSize Pool Origin Data% Move Log Copy% Convert lv_root vg_pcmk1 -wi-ao-- 8.56g lv_swap vg_pcmk1 -wi-ao-- 960.00m # lvcreate -n drbd-demo -L 1G vg_pcmk1 Logical volume "drbd-demo" created # lvs LV VG Attr LSize Pool Origin Data% Move Log Copy% Convert drbd-demo vg_pcmk1 -wi-a--- 1.00G lv_root vg_pcmk1 -wi-ao-- 8.56g lv_swap vg_pcmk1 -wi-ao-- 960.00m ---- Repeat this on the second node, being sure to use the same size partition. [source,C] ---- # ssh pcmk-2 -- lvs LV VG Attr LSize Origin Snap% Move Log Copy% Convert lv_root vg_pcmk1 -wi-ao-- 8.56g lv_swap vg_pcmk1 -wi-ao-- 960.00m # ssh pcmk-2 -- lvcreate -n drbd-demo -L 1G vg_pcmk1 Logical volume "drbd-demo" created # ssh pcmk-2 -- lvs LV VG Attr LSize Origin Snap% Move Log Copy% Convert drbd-demo vg_pcmk1 -wi-a--- 1.00G lv_root vg_pcmk1 -wi-ao-- 8.56g lv_swap vg_pcmk1 -wi-ao-- 960.00m ---- === Write the DRBD Config === There is no series of commands for building a DRBD configuration, so simply copy the configuration below to /etc/drbd.conf. Detailed information on the directives used in this configuration (and other alternatives) is available from http://www.drbd.org/users-guide/ch-configure.html [WARNING] ========= Be sure to use the names and addresses of your nodes if they differ from the ones used in this guide. ========= .... global { usage-count yes; } common { protocol C; } resource wwwdata { meta-disk internal; device /dev/drbd1; syncer { verify-alg sha1; } net { allow-two-primaries; } on pcmk-1 { disk /dev/vg_pcmk1/drbd-demo; address 192.168.122.101:7789; } on pcmk-2 { disk /dev/vg_pcmk1/drbd-demo; address 192.168.122.102:7789; } } .... [NOTE] ======= TODO: Explain the reason for the allow-two-primaries option ======= === Initialize and Load DRBD === With the configuration in place, we can now perform the DRBD initialization: [source,C] ---- # drbdadm create-md wwwdata Writing meta data... initializing activity log NOT initialized bitmap New drbd meta data block successfully created. success ---- Now load the DRBD kernel module and confirm that everything is sane: [source,C] ---- # modprobe drbd # drbdadm up wwwdata # cat /proc/drbd version: 8.3.11 (api:88/proto:86-96) srcversion: 0D2B62DEDB020A425130935 1: cs:Connected ro:Secondary/Secondary ds:Inconsistent/Inconsistent C r----- ns:0 nr:0 dw:0 dr:0 al:0 bm:0 lo:0 pe:0 ua:0 ap:0 ep:1 wo:f oos:1015740 ---- Repeat on the second node: [source,C] ---- # ssh pcmk-2 -- drbdadm --force create-md wwwdata Writing meta data... initializing activity log NOT initialized bitmap New drbd meta data block successfully created. success # ssh pcmk-2 -- modprobe drbd WARNING: Deprecated config file /etc/modprobe.conf, all config files belong into /etc/modprobe.d/. # ssh pcmk-2 -- drbdadm up wwwdata # ssh pcmk-2 -- cat /proc/drbd version: 8.3.11 (api:88/proto:86-96) srcversion: 0D2B62DEDB020A425130935 1: cs:Connected ro:Secondary/Secondary ds:Inconsistent/Inconsistent C r----- ns:0 nr:0 dw:0 dr:0 al:0 bm:0 lo:0 pe:0 ua:0 ap:0 ep:1 wo:f oos:1015740 ---- Now we need to tell DRBD which set of data to use.
Since both sides contain garbage, we can run the following on pcmk-1: [source,C] ---- # drbdadm -- --overwrite-data-of-peer primary wwwdata # cat /proc/drbd version: 8.3.11 (api:88/proto:86-96) srcversion: 0D2B62DEDB020A425130935 1: cs:SyncSource ro:Primary/Secondary ds:UpToDate/Inconsistent C r----- ns:8064 nr:0 dw:0 dr:8728 al:0 bm:0 lo:0 pe:1 ua:0 ap:0 ep:1 wo:f oos:1007804 [>....................] sync'ed: 0.9% (1007804/1015740)K finish: 0:12:35 speed: 1,320 (1,320) K/sec ---- After a while, the sync should finish and you'll see: [source,C] ---- # cat /proc/drbd version: 8.3.11 (api:88/proto:86-96) srcversion: 0D2B62DEDB020A425130935 1: cs:Connected ro:Primary/Secondary ds:UpToDate/UpToDate C r----- ns:1015740 nr:0 dw:0 dr:1016404 al:0 bm:62 lo:0 pe:0 ua:0 ap:0 ep:1 wo:f oos:0 ---- pcmk-1 is now in the Primary state, which allows it to be written to. That makes this a good point at which to create a filesystem and populate it with some data to serve up via our WebSite resource. === Populate DRBD with Data === [source,C] ---- # mkfs.ext4 /dev/drbd1 mke2fs 1.42 (29-Nov-2011) Filesystem label= OS type: Linux Block size=4096 (log=2) Fragment size=4096 (log=2) Stride=0 blocks, Stripe width=0 blocks 63488 inodes, 253935 blocks 12696 blocks (5.00%) reserved for the super user First data block=0 Maximum filesystem blocks=260046848 8 block groups 32768 blocks per group, 32768 fragments per group 7936 inodes per group Superblock backups stored on blocks: 32768, 98304, 163840, 229376 Allocating group tables: done Writing inode tables: done Creating journal (4096 blocks): done Writing superblocks and filesystem accounting information: done ---- Now mount the newly created filesystem so we can create our index file: [source,C] ---- # mount /dev/drbd1 /mnt/ # cat <<-END >/mnt/index.html <html> <body>My Test Site - drbd</body> </html> END # umount /dev/drbd1 ---- == Configure the Cluster for DRBD == ifdef::pcs[] One handy feature pcs has is the ability to queue up several changes into a file and commit those changes atomically. To do this, start by populating the file with the current raw XML config from the CIB. This can be done using the following command. [source,C] ---- # pcs cluster cib drbd_cfg ---- Now, using the pcs -f option, make changes to the configuration saved in the drbd_cfg file. These changes will not be seen by the cluster until the drbd_cfg file is pushed into the live cluster's CIB later on. //// source,C doesn't do well with \'s //// ---- # pcs -f drbd_cfg resource create WebData ocf:linbit:drbd \ drbd_resource=wwwdata op monitor interval=60s # pcs -f drbd_cfg resource master WebDataClone WebData \ master-max=1 master-node-max=1 clone-max=2 clone-node-max=1 \ notify=true ---- [source,C] ---- # pcs -f drbd_cfg resource show ClusterIP (ocf::heartbeat:IPaddr2) Started WebSite (ocf::heartbeat:apache) Started Master/Slave Set: WebDataClone [WebData] Stopped: [ WebData:0 WebData:1 ] ---- After you are satisfied with all the changes, you can commit them all at once by pushing the drbd_cfg file into the live CIB. [source,C] ---- # pcs cluster push cib drbd_cfg CIB updated # pcs status Last updated: Fri Sep 14 12:19:49 2012 Last change: Fri Sep 14 12:19:13 2012 via cibadmin on pcmk-1 Stack: corosync Current DC: pcmk-2 (2) - partition with quorum Version: 1.1.8-1.el7-60a19ed12fdb4d5c6a6b6767f52e5391e447fec0 2 Nodes configured, unknown expected votes 4 Resources configured.
Online: [ pcmk-1 pcmk-2 ] Full list of resources: ClusterIP (ocf::heartbeat:IPaddr2): Started pcmk-1 WebSite (ocf::heartbeat:apache): Started pcmk-1 Master/Slave Set: WebDataClone [WebData] Masters: [ pcmk-1 ] Slaves: [ pcmk-2 ] ---- endif::[] -ifdef::crm[] +ifdef::crmsh[] One handy feature of the crm shell is that you can use it in interactive mode to make several changes atomically. First we launch the shell. The prompt will change to indicate you're in interactive mode. [source,C] ---- # crm crm(live) # ---- Next we must create a working copy of the current configuration. This is where all our changes will go. The cluster will not see any of them until we say it's ok. Notice again how the prompt changes, this time to indicate that we're no longer looking at the live cluster. [source,C] ---- cib crm(live) # cib new drbd INFO: drbd shadow CIB created crm(drbd) # ---- Now we can create our DRBD clone and display the revised configuration. [source,C] ---- crm(drbd) # configure primitive WebData ocf:linbit:drbd params drbd_resource=wwwdata \ op monitor interval=60s crm(drbd) # configure ms WebDataClone WebData meta master-max=1 master-node-max=1 \ clone-max=2 clone-node-max=1 notify=true crm(drbd) # configure show node $id="1702537408" pcmk-1 node $id="1719314624" pcmk-2 primitive ClusterIP ocf:heartbeat:IPaddr2 \ params ip="192.168.122.120" cidr_netmask="32" \ op monitor interval="30s" primitive WebData ocf:linbit:drbd \ params drbd_resource="wwwdata" \ op monitor interval="60s" primitive WebSite ocf:heartbeat:apache \ params configfile="/etc/httpd/conf/httpd.conf" \ op monitor interval="1min" ms WebDataClone WebData \ meta master-max="1" master-node-max="1" clone-max="2" clone-node-max="1" notify="true" location prefer-pcmk-1 WebSite 50: pcmk-1 colocation website-with-ip inf: WebSite ClusterIP order apache-after-ip inf: ClusterIP WebSite property $id="cib-bootstrap-options" \ dc-version="1.1.7-2.fc17-ee0730e13d124c3d58f00016c3376a1de5323cff" \ cluster-infrastructure="corosync" \ stonith-enabled="false" \ no-quorum-policy="ignore" \ last-lrm-refresh="1333446866" rsc_defaults $id="rsc-options" \ resource-stickiness="100" op_defaults $id="op-options" \ timeout="240s" ---- Once we're happy with the changes, we can tell the cluster to start using them and use crm_mon to check everything is functioning. [source,C] ---- crm(drbd) # cib commit drbd INFO: commited 'drbd' shadow CIB to the cluster crm(drbd) # quit bye # crm_mon -1 ============ Last updated: Tue Apr 3 13:50:01 2012 Last change: Tue Apr 3 13:49:46 2012 via crm_shadow on pcmk-1 Stack: corosync Current DC: pcmk-1 (1702537408) - partition with quorum Version: 1.1.7-2.fc17-ee0730e13d124c3d58f00016c3376a1de5323cff 2 Nodes configured, unknown expected votes 4 Resources configured. ============ Online: [ pcmk-1 pcmk-2 ] ClusterIP (ocf::heartbeat:IPaddr2): Started pcmk-1 WebSite (ocf::heartbeat:apache): Started pcmk-1 Master/Slave Set: WebDataClone [WebData] Masters: [ pcmk-1 ] Slaves: [ pcmk-2 ] ---- endif::[] [NOTE] ======= TODO: Include details on adding a second DRBD resource ======= Now that DRBD is functioning we can configure a Filesystem resource to use it. In addition to the filesystem's definition, we also need to tell the cluster where it can be located (only on the DRBD Primary) and when it is allowed to start (after the Primary was promoted). ifdef::pcs[] We are going to take a shortcut when creating the resource this time though. 
Instead of explicitly saying we want the 'ocf:heartbeat:Filesystem' script, we are only going to ask for 'Filesystem'. We can do this because we know there is only one resource script named 'Filesystem' available to pacemaker, and that pcs is smart enough to fill in the 'ocf:heartbeat' portion for us correctly in the configuration. If there were multiple 'Filesystem' scripts from different ocf providers, we would need to specify the exact one we wanted to use. Once again we will queue up our changes to a file and then push the new configuration to the cluster as the final step. ---- # pcs cluster cib fs_cfg # pcs -f fs_cfg resource create WebFS Filesystem \ device="/dev/drbd/by-res/wwwdata" directory="/var/www/html" \ fstype="ext4" ---- [source,C] ---- # pcs -f fs_cfg constraint colocation add WebFS WebDataClone INFINITY with-rsc-role=Master # pcs -f fs_cfg constraint order promote WebDataClone then start WebFS Adding WebDataClone WebFS (kind: Mandatory) (Options: first-action=promote then-action=start) ---- We also need to tell the cluster that Apache needs to run on the same machine as the filesystem and that it must be active before Apache can start. [source,C] ---- # pcs -f fs_cfg constraint colocation add WebSite WebFS INFINITY # pcs -f fs_cfg constraint order WebFS then WebSite ---- Now review the updated configuration. [source,C] ---- # pcs -f fs_cfg constraint Location Constraints: Ordering Constraints: start ClusterIP then start WebSite WebFS then WebSite promote WebDataClone then start WebFS Colocation Constraints: WebSite with ClusterIP WebFS with WebDataClone (with-rsc-role:Master) WebSite with WebFS # pcs -f fs_cfg resource show ClusterIP (ocf::heartbeat:IPaddr2) Started WebSite (ocf::heartbeat:apache) Started Master/Slave Set: WebDataClone [WebData] Masters: [ pcmk-1 ] Slaves: [ pcmk-2 ] WebFS (ocf::heartbeat:Filesystem) Stopped ---- endif::[] -ifdef::crm[] +ifdef::crmsh[] Once again we'll use the shell's interactive mode [source,C] ---- # crm crm(live) # cib new fs INFO: fs shadow CIB created crm(fs) # configure primitive WebFS ocf:heartbeat:Filesystem \ params device="/dev/drbd/by-res/wwwdata" directory="/var/www/html" fstype="ext4" crm(fs) # configure colocation fs_on_drbd inf: WebFS WebDataClone:Master crm(fs) # configure order WebFS-after-WebData inf: WebDataClone:promote WebFS:start ---- We also need to tell the cluster that Apache needs to run on the same machine as the filesystem and that it must be active before Apache can start. 
[source,C] ---- crm(fs) # configure colocation WebSite-with-WebFS inf: WebSite WebFS crm(fs) # configure order WebSite-after-WebFS inf: WebFS WebSite ---- Time to review the updated configuration: [source,C] ---- crm(fs) # configure show node $id="1702537408" pcmk-1 node $id="1719314624" pcmk-2 primitive ClusterIP ocf:heartbeat:IPaddr2 \ params ip="192.168.122.120" cidr_netmask="32" \ op monitor interval="30s" primitive WebData ocf:linbit:drbd \ params drbd_resource="wwwdata" \ op monitor interval="60s" primitive WebFS ocf:heartbeat:Filesystem \ params device="/dev/drbd/by-res/wwwdata" directory="/var/www/html" fstype="ext4" primitive WebSite ocf:heartbeat:apache \ params configfile="/etc/httpd/conf/httpd.conf" \ op monitor interval="1min" ms WebDataClone WebData \ meta master-max="1" master-node-max="1" clone-max="2" clone-node-max="1" notify="true" location prefer-pcmk-1 WebSite 50: pcmk-1 colocation WebSite-with-WebFS inf: WebSite WebFS colocation fs_on_drbd inf: WebFS WebDataClone:Master colocation website-with-ip inf: WebSite ClusterIP order WebFS-after-WebData inf: WebDataClone:promote WebFS:start order WebSite-after-WebFS inf: WebFS WebSite order apache-after-ip inf: ClusterIP WebSite property $id="cib-bootstrap-options" \ dc-version="1.1.7-2.fc17-ee0730e13d124c3d58f00016c3376a1de5323cff" \ cluster-infrastructure="corosync" \ stonith-enabled="false" \ no-quorum-policy="ignore" \ last-lrm-refresh="1333446866" rsc_defaults $id="rsc-options" \ resource-stickiness="100" op_defaults $id="op-options" \ timeout="240s" ---- endif::[] After reviewing the new configuration, we again upload it and watch the cluster put it into effect. ifdef::pcs[] [source,C] ---- # pcs cluster push cib fs_cfg CIB updated # pcs status Last updated: Fri Aug 10 12:47:01 2012 Last change: Fri Aug 10 12:46:55 2012 via cibadmin on pcmk-1 Stack: corosync Current DC: pcmk-1 (1) - partition with quorum Version: 1.1.8-1.el7-60a19ed12fdb4d5c6a6b6767f52e5391e447fec0 2 Nodes configured, unknown expected votes 5 Resources configured. Online: [ pcmk-1 pcmk-2 ] Full list of resources: ClusterIP (ocf::heartbeat:IPaddr2): Started pcmk-1 WebSite (ocf::heartbeat:apache): Started pcmk-1 Master/Slave Set: WebDataClone [WebData] Masters: [ pcmk-1 ] Slaves: [ pcmk-2 ] WebFS (ocf::heartbeat:Filesystem): Started pcmk-1 ---- endif::[] -ifdef::crm[] +ifdef::crmsh[] [source,C] ---- crm(fs) # cib commit fs INFO: commited 'fs' shadow CIB to the cluster crm(fs) # quit bye # crm_mon -1 ============ Last updated: Tue Apr 3 13:52:21 2012 Last change: Tue Apr 3 13:52:06 2012 via crm_shadow on pcmk-1 Stack: corosync Current DC: pcmk-1 (1702537408) - partition with quorum Version: 1.1.7-2.fc17-ee0730e13d124c3d58f00016c3376a1de5323cff 2 Nodes configured, unknown expected votes 5 Resources configured. ============ Online: [ pcmk-1 pcmk-2 ] ClusterIP (ocf::heartbeat:IPaddr2): Started pcmk-1 WebSite (ocf::heartbeat:apache): Started pcmk-1 Master/Slave Set: WebDataClone [WebData] Masters: [ pcmk-1 ] Slaves: [ pcmk-2 ] WebFS (ocf::heartbeat:Filesystem): Started pcmk-1 ---- endif::[] === Testing Migration === We could shut down the active node again, but another way to safely simulate recovery is to put the node into what is called "standby mode". Nodes in this state tell the cluster that they are not allowed to run resources. Any resources found active there will be moved elsewhere. This feature can be particularly useful when updating the resources' packages. 
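Behind the scenes, standby mode is recorded as a node attribute named +standby+, which the commands below toggle for you. If you are curious, you can also query that attribute directly with the low-level `crm_attribute` tool; the following one-liner is only an illustrative sketch, assuming the node name used throughout this guide:

[source,C]
----
# crm_attribute --node pcmk-1 --name standby --query
----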
Put the local node into standby mode and observe the cluster move all the resources to the other node. Note also that the node's status will change to indicate that it can no longer host resources. ifdef::pcs[] [source,C] ---- # pcs cluster standby pcmk-1 # pcs status Last updated: Fri Sep 14 12:41:12 2012 Last change: Fri Sep 14 12:41:08 2012 via crm_attribute on pcmk-1 Stack: corosync Current DC: pcmk-1 (1) - partition with quorum Version: 1.1.8-1.el7-60a19ed12fdb4d5c6a6b6767f52e5391e447fec0 2 Nodes configured, unknown expected votes 5 Resources configured. Node pcmk-1 (1): standby Online: [ pcmk-2 ] Full list of resources: ClusterIP (ocf::heartbeat:IPaddr2): Started pcmk-2 WebSite (ocf::heartbeat:apache): Started pcmk-2 Master/Slave Set: WebDataClone [WebData] Masters: [ pcmk-2 ] Stopped: [ WebData:1 ] WebFS (ocf::heartbeat:Filesystem): Started pcmk-2 ---- endif::[] -ifdef::crm[] +ifdef::crmsh[] [source,C] ---- # crm node standby # crm_mon -1 ============ Last updated: Tue Apr 3 13:59:14 2012 Last change: Tue Apr 3 13:52:36 2012 via crm_attribute on pcmk-1 Stack: corosync Current DC: pcmk-1 (1702537408) - partition with quorum Version: 1.1.7-2.fc17-ee0730e13d124c3d58f00016c3376a1de5323cff 2 Nodes configured, unknown expected votes 5 Resources configured. ============ Node pcmk-1 (1702537408): standby Online: [ pcmk-2 ] ClusterIP (ocf::heartbeat:IPaddr2): Started pcmk-2 WebSite (ocf::heartbeat:apache): Started pcmk-2 Master/Slave Set: WebDataClone [WebData] Masters: [ pcmk-2 ] Stopped: [ WebData:1 ] WebFS (ocf::heartbeat:Filesystem): Started pcmk-2 ---- endif::[] Once we've done everything we needed to on pcmk-1 (in this case nothing, we just wanted to see the resources move), we can allow the node to be a full cluster member again. ifdef::pcs[] [source,C] ---- # pcs cluster unstandby pcmk-1 # pcs status Last updated: Fri Sep 14 12:43:02 2012 Last change: Fri Sep 14 12:42:57 2012 via crm_attribute on pcmk-1 Stack: corosync Current DC: pcmk-1 (1) - partition with quorum Version: 1.1.8-1.el7-60a19ed12fdb4d5c6a6b6767f52e5391e447fec0 2 Nodes configured, unknown expected votes 5 Resources configured. Online: [ pcmk-1 pcmk-2 ] Full list of resources: ClusterIP (ocf::heartbeat:IPaddr2): Started pcmk-2 WebSite (ocf::heartbeat:apache): Started pcmk-2 Master/Slave Set: WebDataClone [WebData] Masters: [ pcmk-2 ] Slaves: [ pcmk-1 ] WebFS (ocf::heartbeat:Filesystem): Started pcmk-2 ---- endif::[] -ifdef::crm[] +ifdef::crmsh[] [source,C] ---- # crm node online # crm_mon -1 ============ Last updated: Tue Apr 3 14:00:06 2012 Last change: Tue Apr 3 14:00:00 2012 via crm_attribute on pcmk-1 Stack: corosync Current DC: pcmk-1 (1702537408) - partition with quorum Version: 1.1.7-2.fc17-ee0730e13d124c3d58f00016c3376a1de5323cff 2 Nodes configured, unknown expected votes 5 Resources configured. ============ Online: [ pcmk-1 pcmk-2 ] ClusterIP (ocf::heartbeat:IPaddr2): Started pcmk-2 WebSite (ocf::heartbeat:apache): Started pcmk-2 Master/Slave Set: WebDataClone [WebData] Masters: [ pcmk-2 ] Slaves: [ pcmk-1 ] WebFS (ocf::heartbeat:Filesystem): Started pcmk-2 ---- endif::[] Notice that our resource stickiness settings prevent the services from migrating back to pcmk-1. 
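Should you actually want the services back on pcmk-1, you can override the stickiness by moving the resources manually. A manual move works by inserting a temporary location constraint, so it should be cleared afterwards or the resources will remain pinned to that node. The commands below are only a sketch, assuming the resource names used in this chapter (and, for pcs, a version that provides the move/unmove commands):

ifdef::pcs[]
[source,C]
----
# pcs resource move WebSite pcmk-1
# pcs resource unmove WebSite
----
endif::[]
ifdef::crmsh[]
[source,C]
----
# crm resource migrate WebSite pcmk-1
# crm resource unmigrate WebSite
----
endif::[]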
diff --git a/doc/Clusters_from_Scratch/en-US/Ch-Stonith.txt b/doc/Clusters_from_Scratch/en-US/Ch-Stonith.txt index dc37e905ee..123bd4b361 100644 --- a/doc/Clusters_from_Scratch/en-US/Ch-Stonith.txt +++ b/doc/Clusters_from_Scratch/en-US/Ch-Stonith.txt @@ -1,308 +1,308 @@ = Configure STONITH = == What Is STONITH == STONITH is an acronym for Shoot-The-Other-Node-In-The-Head and it protects your data from being corrupted by rogue nodes or concurrent access. Just because a node is unresponsive, this doesn't mean it isn't accessing your data. The only way to be 100% sure that your data is safe is to use STONITH, so we can be certain that the node is truly offline before allowing the data to be accessed from another node. STONITH also has a role to play in the event that a clustered service cannot be stopped. In this case, the cluster uses STONITH to force the whole node offline, thereby making it safe to start the service elsewhere. == What STONITH Device Should You Use == It is crucial that the STONITH device allows the cluster to differentiate between a node failure and a network one. The biggest mistake people make in choosing a STONITH device is to use a remote power switch (such as many on-board IPMI controllers) that shares power with the node it controls. In such cases, the cluster cannot be sure if the node is really offline, or active and suffering from a network fault. Likewise, any device that relies on the machine being active (such as SSH-based "devices" used during testing) is inappropriate. == Configuring STONITH == ifdef::pcs[] . Find the correct driver: +pcs stonith list+ . Find the parameters associated with the device: +pcs stonith describe <agent>+ . Create a local copy of the CIB to make changes to: +pcs cluster cib stonith_cfg+ . Create the fencing resource using +pcs -f stonith_cfg stonith create <stonith_id> <stonith device type> [stonith device options]+ . Set stonith-enabled to true: +pcs -f stonith_cfg property set stonith-enabled=true+ endif::[] -ifdef::crm[] +ifdef::crmsh[] . Find the correct driver: +stonith_admin --list-installed+ . Since every device is different, the parameters needed to configure it will vary. To find out the parameters associated with the device, run: +stonith_admin --metadata --agent <type>+ The output should be XML formatted text containing additional parameter descriptions. We will endeavor to make the output more friendly in a later version. . Enter the shell: +crm+ Create an editable copy of the existing configuration: +cib new stonith+ Then create a fencing resource containing a primitive resource with a class of stonith, a type of <type> and a parameter for each of the values returned in step 2: +configure primitive ...+ endif::[] . If the device does not know how to fence nodes based on their uname, you may also need to set the special +pcmk_host_map+ parameter. See +man stonithd+ for details. . If the device does not support the list command, you may also need to set the special +pcmk_host_list+ and/or +pcmk_host_check+ parameters. See +man stonithd+ for details. . If the device does not expect the victim to be specified with the port parameter, you may also need to set the special +pcmk_host_argument+ parameter. See +man stonithd+ for details. -ifdef::crm[] +ifdef::crmsh[] . Upload it into the CIB from the shell: +cib commit stonith+ endif::[] ifdef::pcs[] . Commit the new configuration: +pcs cluster push cib stonith_cfg+ endif::[] . Once the stonith resource is running, you can test it by executing: +stonith_admin --reboot nodename+.
Although you might want to stop the cluster on that machine first. == Example == Assuming we have a chassis containing four nodes and an IPMI device active on 10.0.0.1, then we would choose the fence_ipmilan driver in step 2 and obtain the following list of parameters .Obtaining a list of STONITH Parameters ifdef::pcs[] [source,C] ---- # pcs stonith describe fence_ipmilan Stonith options for: fence_ipmilan auth: IPMI Lan Auth type (md5, password, or none) ipaddr: IPMI Lan IP to talk to passwd: Password (if required) to control power on IPMI device passwd_script: Script to retrieve password (if required) lanplus: Use Lanplus login: Username/Login (if required) to control power on IPMI device action: Operation to perform. Valid operations: on, off, reboot, status, list, diag, monitor or metadata timeout: Timeout (sec) for IPMI operation cipher: Ciphersuite to use (same as ipmitool -C parameter) method: Method to fence (onoff or cycle) power_wait: Wait X seconds after on/off operation delay: Wait X seconds before fencing is started privlvl: Privilege level on IPMI device verbose: Verbose mode ---- endif::[] -ifdef::crm[] +ifdef::crmsh[] [source,C] ---- # stonith_admin --metadata -a fence_ipmilan ---- [source,XML] ---- fence_ipmilan is an I/O Fencing agent which can be used with machines controlled by IPMI. This agent calls support software using ipmitool (http://ipmitool.sf.net/). To use fence_ipmilan with HP iLO 3 you have to enable lanplus option (lanplus / -P) and increase wait after operation to 4 seconds (power_wait=4 / -T 4) IPMI Lan Auth type (md5, password, or none) IPMI Lan IP to talk to Password (if required) to control power on IPMI device Script to retrieve password (if required) Use Lanplus Username/Login (if required) to control power on IPMI device Operation to perform. Valid operations: on, off, reboot, status, list, diag, monitor or metadata Timeout (sec) for IPMI operation Ciphersuite to use (same as ipmitool -C parameter) Method to fence (onoff or cycle) Wait X seconds after on/off operation Wait X seconds before fencing is started Verbose mode ---- endif::[] from which we would create a STONITH resource fragment that might look like this .Sample STONITH Resource ifdef::pcs[] ---- # pcs cluster cib stonith_cfg # pcs -f stonith_cfg stonith create ipmi-fencing fence_ipmilan \ pcmk_host_list="pcmk-1 pcmk-2" ipaddr=10.0.0.1 login=testuser \ passwd=abc123 op monitor interval=60s ---- [source,C] ---- # pcs -f stonith_cfg stonith ipmi-fencing (stonith:fence_ipmilan) Stopped ---- endif::[] -ifdef::crm[] +ifdef::crmsh[] [source,C] ---- # crm crm(live)# cib new stonith INFO: stonith shadow CIB created crm(stonith)# configure primitive ipmi-fencing stonith::fence_ipmilan \ params pcmk_host_list="pcmk-1 pcmk-2" ipaddr=10.0.0.1 login=testuser passwd=abc123 \ op monitor interval="60s" ---- endif::[] And finally, since we disabled it earlier, we need to re-enable STONITH. At this point we should have the following configuration. ifdef::pcs[] [source,C] ---- # pcs -f stonith_cfg property set stonith-enabled=true # pcs -f stonith_cfg property dc-version: 1.1.8-1.el7-60a19ed12fdb4d5c6a6b6767f52e5391e447fec0 cluster-infrastructure: corosync no-quorum-policy: ignore stonith-enabled: true ---- endif::[] Now push the configuration into the cluster.
ifdef::pcs[] [source,C] ---- # pcs cluster push cib stonith_cfg ---- endif::[] -ifdef::crm[] +ifdef::crmsh[] [source,C] ---- crm(stonith)# configure property stonith-enabled="true" crm(stonith)# configure show node pcmk-1 node pcmk-2 primitive WebData ocf:linbit:drbd \ params drbd_resource="wwwdata" \ op monitor interval="60s" primitive WebFS ocf:heartbeat:Filesystem \ params device="/dev/drbd/by-res/wwwdata" directory="/var/www/html" fstype="gfs2" primitive WebSite ocf:heartbeat:apache \ params configfile="/etc/httpd/conf/httpd.conf" \ op monitor interval="1min" primitive ClusterIP ocf:heartbeat:IPaddr2 \ params ip="192.168.122.101" cidr_netmask="32" clusterip_hash="sourceip" \ op monitor interval="30s" primitive ipmi-fencing stonith::fence_ipmilan \ params pcmk_host_list="pcmk-1 pcmk-2" ipaddr=10.0.0.1 login=testuser passwd=abc123 \ op monitor interval="60s" ms WebDataClone WebData \ meta master-max="2" master-node-max="1" clone-max="2" clone-node-max="1" notify="true" clone WebFSClone WebFS clone WebIP ClusterIP \ meta globally-unique="true" clone-max="2" clone-node-max="2" clone WebSiteClone WebSite colocation WebSite-with-WebFS inf: WebSiteClone WebFSClone colocation fs_on_drbd inf: WebFSClone WebDataClone:Master colocation website-with-ip inf: WebSiteClone WebIP order WebFS-after-WebData inf: WebDataClone:promote WebFSClone:start order WebSite-after-WebFS inf: WebFSClone WebSiteClone order apache-after-ip inf: WebIP WebSiteClone property $id="cib-bootstrap-options" \ dc-version="1.1.5-bdd89e69ba545404d02445be1f3d72e6a203ba2f" \ cluster-infrastructure="openais" \ expected-quorum-votes="2" \ stonith-enabled="true" \ no-quorum-policy="ignore" rsc_defaults $id="rsc-options" \ resource-stickiness="100" crm(stonith)# cib commit stonith INFO: commited 'stonith' shadow CIB to the cluster crm(stonith)# quit bye ---- endif::[] diff --git a/doc/Clusters_from_Scratch/en-US/Ch-Tools.txt b/doc/Clusters_from_Scratch/en-US/Ch-Tools.txt index d3c255d3df..06229b09f7 100644 --- a/doc/Clusters_from_Scratch/en-US/Ch-Tools.txt +++ b/doc/Clusters_from_Scratch/en-US/Ch-Tools.txt @@ -1,160 +1,160 @@ = Pacemaker Tools = == Using Pacemaker Tools == In the dark past, configuring Pacemaker required the administrator to read and write XML. In true UNIX style, there were also a number of different commands that specialized in different aspects of querying and updating the cluster. All of that has been greatly simplified with the creation of unified command-line shells (and GUIs) that hide all the messy XML scaffolding. These shells take all the individual aspects required for managing and configuring a cluster, and pack them into one simple-to-use command-line tool. They even allow you to queue up several changes at once and commit them atomically. There are currently two command-line shells that people use, `pcs` and -the `crm shell`. This edition of Clusters from Scratch is based on -{cli_name}. Start by taking some time to familiarize yourself with what it -can do. +`crmsh`. This edition of Clusters from Scratch is based on ++{cli_name}+. Start by taking some time to familiarize yourself with +what it can do. [NOTE] =========== The two shells share many concepts but the scope, layout and syntax do differ, so make sure you read the version of this guide that corresponds to the software installed on your system.
=========== ifdef::pcs[] [IMPORTANT] =========== Since `pcs` has the ability to manage all aspects of the cluster (both corosync and pacemaker), it requires a specific cluster stack to be in use (corosync 2.0 with votequorum plus Pacemaker version 1.1.8 or greater). =========== [source,C] # pcs ..... Control and configure pacemaker and corosync. Options: -h Display usage and exit -f file Perform actions on file instead of active CIB Commands: resource Manage cluster resources cluster Configure cluster options and nodes stonith Configure fence devices property Set pacemaker properties constraint Set resource constraints status View cluster status ..... As you can see, the different aspects of cluster management are broken up into categories: resource, cluster, stonith, property, constraint, and status. To discover the functionality available in each of these categories, one can issue the command 'pcs <category> help'. Below is an example of all the options available under the status category. [source,C] # pcs status help ..... Usage: pcs status [commands]... View current cluster and resource status Commands: status View all information about the cluster and resources status resources View current status of cluster resources status groups View currently configured groups and their resources status cluster View current cluster status status corosync View current corosync status status nodes [corosync] View current status of nodes from pacemaker, or if corosync is specified, print nodes currently configured in corosync status actions View failed actions status pcsd ... Show the current status of pcsd on the specified nodes status xml View xml version of status (output from crm_mon -r -1 -X) ..... Additionally, if you are interested in the Pacemaker version and supported cluster stack(s) available with your current Pacemaker installation, the pacemakerd --features option is available to you. [source,C] # pacemakerd --features ------------------ sys::[pacemakerd --features] ------------------ [NOTE] ====== If the SNMP and/or email options are not listed, then Pacemaker was not built to support them. This may be by the choice of your distribution or the required libraries may not have been available. Please contact whoever supplied you with the packages for more details.
====== endif::[] diff --git a/doc/Clusters_from_Scratch/en-US/Ch-Verification.txt b/doc/Clusters_from_Scratch/en-US/Ch-Verification.txt index 85adf2d884..7fcff9df63 100644 --- a/doc/Clusters_from_Scratch/en-US/Ch-Verification.txt +++ b/doc/Clusters_from_Scratch/en-US/Ch-Verification.txt @@ -1,293 +1,293 @@ = Verify Cluster Installation = ifdef::pcs[] == Start the Cluster == Now that corosync is configured, it is time to start the cluster. The command below will start corosync and pacemaker on both nodes in the cluster. If you are issuing the start command from a different node than the one you ran the 'pcs cluster auth' command on earlier, you must authenticate on the current node you are logged into before you will be allowed to start the cluster. [source,C] ---- # pcs cluster start --all pcmk-1: Starting Cluster... pcmk-2: Starting Cluster... ---- An alternative to using the 'pcs cluster start --all' command is to issue either of the below commands on each node in the cluster by hand. [source,C] ---- # pcs cluster start Starting Cluster... ---- or [source,C] ---- # systemctl start corosync.service # systemctl start pacemaker.service ---- endif::[] == Verify Corosync Installation == -ifdef::crm[] +ifdef::crmsh[] Start Corosync on the first node: [source,C] ---- # systemctl start corosync.service ---- endif::[] The first thing to check is whether cluster communication is happy; for that, we use `corosync-cfgtool`. -ifdef::crm[] +ifdef::crmsh[] [source,C] ---- # corosync-cfgtool -s Printing ring status. Local node ID 1702537408 RING ID 0 id = 192.168.122.101 status = ring 0 active with no faults ---- endif::[] ifdef::pcs[] [source,C] ---- # corosync-cfgtool -s Printing ring status. Local node ID 1 RING ID 0 id = 192.168.122.101 status = ring 0 active with no faults ---- endif::[] We can see here that everything appears normal: our fixed IP address, not a 127.0.0.x loopback address, is listed as the +id+, and +no faults+ is reported for the status. If you see something different, you might want to start by checking the node's network, firewall and selinux configurations. Next we check the membership and quorum APIs: ifdef::pcs[] [source,C] ---- # corosync-cmapctl | grep members runtime.totem.pg.mrp.srp.members.1.ip (str) = r(0) ip(192.168.122.101) runtime.totem.pg.mrp.srp.members.1.join_count (u32) = 1 runtime.totem.pg.mrp.srp.members.1.status (str) = joined runtime.totem.pg.mrp.srp.members.2.ip (str) = r(0) ip(192.168.122.102) runtime.totem.pg.mrp.srp.members.2.join_count (u32) = 1 runtime.totem.pg.mrp.srp.members.2.status (str) = joined # pcs status corosync Membership information -------------------------- Nodeid Votes Name 1 1 pcmk-1 2 1 pcmk-2 ---- You should see both nodes have joined the cluster. endif::[] -ifdef::crm[] +ifdef::crmsh[] [source,C] ---- # corosync-cmapctl | grep members runtime.totem.pg.mrp.srp.members.1702537408.ip (str) = r(0) ip(192.168.122.101) runtime.totem.pg.mrp.srp.members.1702537408.join_count (u32) = 1 runtime.totem.pg.mrp.srp.members.1702537408.status (str) = joined # corosync-quorumtool -l Membership information -------------------------- Nodeid Votes Name 1702537408 1 pcmk-1 ---- The node sees itself in both locations, which is a good sign. If the node list is empty when you call `corosync-quorumtool`, then you've not correctly configured quorum in 'corosync.conf'. With everything looking healthy, we start Corosync on the second node and run the same communications check. [source,C] ---- # ssh pcmk-2 -- systemctl start corosync.service # ssh pcmk-2 -- corosync-cfgtool -s Printing ring status.
Local node ID 1719314624 RING ID 0 id = 192.168.122.102 status = ring 0 active with no faults ---- Everything appears to look ok from +pcmk-2+; time to re-run the membership and quorum checks to see if it shows up there too. Again, if you see something different from the above, check for the usual suspects: network, firewall and selinux. [source,C] ---- # corosync-cmapctl | grep members runtime.totem.pg.mrp.srp.members.1702537408.ip (str) = r(0) ip(192.168.122.101) runtime.totem.pg.mrp.srp.members.1702537408.join_count (u32) = 1 runtime.totem.pg.mrp.srp.members.1702537408.status (str) = joined runtime.totem.pg.mrp.srp.members.1719314624.ip (str) = r(0) ip(192.168.122.102) runtime.totem.pg.mrp.srp.members.1719314624.join_count (u32) = 1 runtime.totem.pg.mrp.srp.members.1719314624.status (str) = joined # corosync-quorumtool -l Membership information -------------------------- Nodeid Votes Name 1702537408 1 pcmk-1 1719314624 1 pcmk-2 ---- endif::[] All good! == Verify Pacemaker Installation == ifdef::pcs[] Now that we have confirmed that Corosync is functional, we can check the rest of the stack. Pacemaker has already been started, so verify the necessary processes are running. [source,C] ---- # ps axf PID TTY STAT TIME COMMAND 2 ? S 0:00 [kthreadd] ...lots of processes... 28019 ? Ssl 0:03 /usr/sbin/corosync 28047 ? Ss 0:00 /usr/sbin/pacemakerd -f 28048 ? Ss 0:00 \_ /usr/libexec/pacemaker/cib 28049 ? Ss 0:00 \_ /usr/libexec/pacemaker/stonithd 28050 ? Ss 0:00 \_ /usr/lib64/heartbeat/lrmd 28051 ? Ss 0:00 \_ /usr/libexec/pacemaker/attrd 28052 ? Ss 0:00 \_ /usr/libexec/pacemaker/pengine 28053 ? Ss 0:00 \_ /usr/libexec/pacemaker/crmd ---- If that looks ok, check the pcs status output. [source,C] ---- # pcs status Last updated: Fri Sep 14 09:52:25 2012 Last change: Fri Sep 14 09:51:55 2012 via crmd on pcmk-2 Stack: corosync Current DC: pcmk-2 (2) - partition with quorum Version: 1.1.8-1.el7-60a19ed12fdb4d5c6a6b6767f52e5391e447fec0 2 Nodes configured, unknown expected votes 0 Resources configured. Online: [ pcmk-1 pcmk-2 ] Full list of resources: ---- Next, check for any ERRORs during startup - there shouldn't be any. [source,C] ---- # grep -i error /var/log/messages ---- Repeat these checks on the other node. The results should be the same. endif::[] -ifdef::crm[] +ifdef::crmsh[] Now that we have confirmed that Corosync is functional, we can check the rest of the stack. Start Pacemaker and check the necessary processes have been started. [source,C] ---- # systemctl start pacemaker.service # ps axf PID TTY STAT TIME COMMAND 2 ? S 0:00 [kthreadd] ...lots of processes... 28019 ? Ssl 0:03 /usr/sbin/corosync 28047 ? Ss 0:00 /usr/sbin/pacemakerd -f 28048 ? Ss 0:00 \_ /usr/libexec/pacemaker/cib 28049 ? Ss 0:00 \_ /usr/libexec/pacemaker/stonithd 28050 ? Ss 0:00 \_ /usr/lib64/heartbeat/lrmd 28051 ? Ss 0:00 \_ /usr/libexec/pacemaker/attrd 28052 ? Ss 0:00 \_ /usr/libexec/pacemaker/pengine 28053 ? Ss 0:00 \_ /usr/libexec/pacemaker/crmd ---- If that looks ok, check the logs and crm_mon.
[source,C] ---- # grep pacemakerd /var/log/messages | grep -e get_cluster_type -e read_config Apr 3 09:19:32 pcmk-1 pacemakerd[28047]: info: get_cluster_type: Detected an active 'corosync' cluster Apr 3 09:19:32 pcmk-1 pacemakerd[28047]: info: read_config: Reading configure for stack: corosync # crm_mon -1 ============ Last updated: Tue Apr 3 09:21:37 2012 Last change: Tue Apr 3 09:19:54 2012 via crmd on pcmk-1 Stack: corosync Current DC: pcmk-1 (1702537408) - partition with quorum Version: 1.1.7-2.fc17-ee0730e13d124c3d58f00016c3376a1de5323cff 1 Nodes configured, unknown expected votes 0 Resources configured. ============ Online: [ pcmk-1 ] ---- Next, check for any ERRORs during startup - there shouldn't be any. [source,C] ---- # grep -i error /var/log/messages ---- Repeat on the other node and display the cluster's status. [source,C] ---- # ssh pcmk-2 -- systemctl start pacemaker.service # crm_mon -1 ============ Last updated: Tue Apr 3 09:26:23 2012 Last change: Tue Apr 3 09:26:21 2012 via crmd on pcmk-1 Stack: corosync Current DC: pcmk-1 (1702537408) - partition with quorum Version: 1.1.7-2.fc17-ee0730e13d124c3d58f00016c3376a1de5323cff 2 Nodes configured, unknown expected votes 0 Resources configured. ============ Online: [ pcmk-1 pcmk-2 ] ---- endif::[] diff --git a/doc/Makefile.am b/doc/Makefile.am index 8c285cd481..4aa10c7419 100644 --- a/doc/Makefile.am +++ b/doc/Makefile.am @@ -1,197 +1,202 @@ # # doc: Pacemaker code # # Copyright (C) 2008 Andrew Beekhof # # This program is free software; you can redistribute it and/or # modify it under the terms of the GNU General Public License # as published by the Free Software Foundation; either version 2 # of the License, or (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program; if not, write to the Free Software # Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. 
# MAINTAINERCLEANFILES = Makefile.in helpdir = $(datadir)/$(PACKAGE) ascii = crm_fencing.txt acls.txt docbook = Pacemaker_Explained Clusters_from_Scratch doc_DATA = README.hb2openais $(ascii) $(generated_docs) publican_docs = generated_docs = generated_mans = DOCBOOK_FORMATS := html-desktop DOCBOOK_LANGS := en-US DOTs = $(wildcard */en-US/images/*.dot) SVG = $(wildcard */en-US/images/pcmk-*.svg) $(DOTs:%.dot=%.svg) PNGS = $(SVG:%.svg=%-small.png) $(SVG:%.svg=%.png) $(SVG:%.svg=%-large.png) \ Pacemaker_Explained/en-US/images/Policy-Engine-big.png Pacemaker_Explained/en-US/images/Policy-Engine-small.png BRAND_PNGS = publican-clusterlabs/en-US/images/title_logo.png \ publican-clusterlabs/en-US/images/image_left.png \ publican-clusterlabs/en-US/images/image_right.png \ publican-clusterlabs/en-US/images/h1-bg.png graphics: $(PNGS) %.png: %.svg $(INKSCAPE) --file=$< --export-dpi=90 -C --export-png=$@ %-small.png: %.svg $(INKSCAPE) --file=$< --export-dpi=45 -C --export-png=$@ %-large.png: %.svg $(INKSCAPE) --file=$< --export-dpi=180 -C --export-png=$@ if BUILD_ASCIIDOC generated_docs += $(ascii:%.txt=%.html) if BUILD_DOCBOOK publican_docs += $(docbook) endif endif EXTRA_DIST = $(docbook:%=%.xml) %.html: %.txt $(ASCIIDOC) --unsafe --backend=xhtml11 $< %.8: %.8.txt a2x -L -f manpage $< # Build docbook from asciidoc because XML is a PITA to edit # # Build each chapter as a book (since the numbering isn't right for # articles and only books can have appendicies) and then strip out the # bits we don't want/need # %.xml: %.txt asciidoc -b docbook -a cli_name=$(ASCIIDOC_CLI_TYPE) -a $(ASCIIDOC_CLI_TYPE)=true -d book -o $@ $< sed -i.sed 's///' $@ sed -i.sed 's/ //' $@ # Fix line endings sed -i.sed 's/\ lang="en"//' $@ # Never specify a language in the chapters sed -i.sed 's/simpara/para/g' $@ # publican doesn't correctly render footnotes with simpara sed -i.sed 's/.*.*//g' $@ # Remove dangling tag sed -i.sed 's/.*preface>//g' $@ # Remove preface elements sed -i.sed 's:::g' $@ # Remove empty title sed -i.sed 's/chapter/section/g' $@ # Chapters become sections, so that books can become chapters sed -i.sed 's/<.*bookinfo.*>//g' $@ # Strip out bookinfo, we don't need it -grep -qis "//' $@ # We just want the appendix tag -grep -vqis "/chapter>/g' $@ # Rename to chapter echo Rebuilt $@ from $< CFS_TXT=$(wildcard Clusters_from_Scratch/en-US/*.txt) CFS_XML=$(CFS_TXT:%.txt=%.xml) # We have to hardcode the book name # With '%' the test for 'newness' fails Clusters_from_Scratch.build: $(PNGS) $(wildcard Clusters_from_Scratch/en-US/*.xml) $(CFS_XML) @echo Building $(@:%.build=%) because of $? rm -rf $(@:%.build=%)/publish/* cd $(@:%.build=%) && RPM_BUILD_DIR="" $(PUBLICAN) build --publish --langs=$(DOCBOOK_LANGS) --formats=$(DOCBOOK_FORMATS) rm -rf $(@:%.build=%)/tmp touch $@ PE_TXT=$(wildcard Pacemaker_Explained/en-US/*.txt) PE_XML=$(PE_TXT:%.txt=%.xml) # We have to hardcode the book name # With '%' the test for 'newness' fails Pacemaker_Explained.build: $(PNGS) $(wildcard Pacemaker_Explained/en-US/*.xml) $(PE_XML) @echo Building $(@:%.build=%) because of $? 
rm -rf $(@:%.build=%)/publish/* cd $(@:%.build=%) && RPM_BUILD_DIR="" $(PUBLICAN) build --publish --langs=$(DOCBOOK_LANGS) --formats=$(DOCBOOK_FORMATS) rm -rf $(@:%.build=%)/tmp touch $@ # Update the translation template pot: for book in $(docbook); do \ echo "Updating translation templates in: $$book"; \ ( cd $$book && RPM_BUILD_DIR="" $(PUBLICAN) update_pot ); \ done # Update the actual translations po: for book in $(docbook); do \ echo "Updating translations in: $$book"; \ ( cd $$book && RPM_BUILD_DIR="" $(PUBLICAN) update_po --langs=$(DOCBOOK_LANGS) );\ done if BUILD_DOCBOOK docbook_build = $(docbook:%=%.build) all-local: $(docbook_build) */publican.cfg #install-data-local: all-local install-data-local: all-local for book in $(docbook); do \ filelist=`find $$book/publish/* -print`; \ for f in $$filelist; do \ p=`echo $$f | sed s:publish/:: | sed s:Pacemaker/::`; \ if [ -d $$f ]; then \ $(INSTALL) -d 775 $(DESTDIR)$(docdir)/$$p; \ else \ $(INSTALL) -m 644 $$f $(DESTDIR)$(docdir)/$$p; \ fi \ done; \ done endif brand: $(BRAND_PNGS) $(wildcard publican-clusterlabs/en-US/*.xml) cd publican-clusterlabs && publican build --formats=xml --langs=all --publish echo "Installing..." cd publican-clusterlabs && sudo publican install_brand --path=$(datadir)/publican/Common_Content # find publican-clusterlabs -name "*.noarch.rpm" -exec rm -f \{\} \; # cd publican-clusterlabs && $(PUBLICAN) package --binary # find publican-clusterlabs -name "*.noarch.rpm" -exec sudo rpm -Uvh --force \{\} \; -www: $(generated_docs) www-pcs www-crm - echo rsync -rtz --progress $(generated_docs) $(ascii) root@www.clusterlabs.org:/var/www/html/doc/ - rsync -rtz --progress $(generated_docs) $(ascii) root@www.clusterlabs.org:/var/www/html/doc/ +pdf: + make DOCBOOK_FORMATS="pdf" ASCIIDOC_CLI_TYPE=$(ASCIIDOC_CLI_TYPE) all-local + +# Make sure www-(pcs|crmsh) happen in serial +www: + make www-pcs + make www-crmsh + make $(generated_docs) $(ascii) + rsync -rtz --progress $(generated_docs) $(ascii) $(asciiman) root@www.clusterlabs.org:/var/www/html/doc/ -www-crm: clean-local - make ASCIIDOC_CLI_TYPE=crm www-cli +www-crmsh: + make ASCIIDOC_CLI_TYPE=crmsh clean-local www-cli -www-pcs: clean-local - make ASCIIDOC_CLI_TYPE=pcs www-cli +www-pcs: + make ASCIIDOC_CLI_TYPE=pcs clean-local www-cli www-cli: for book in $(docbook); do \ sed -i.sed 's@brand:.*@brand: clusterlabs@' $$book/publican.cfg; \ sed -i.sed 's@version:.*@version: $(PACKAGE_SERIES)-$(ASCIIDOC_CLI_TYPE)@' $$book/publican.cfg; \ done - make DOCBOOK_FORMATS="pdf,html,html-single,epub" DOCBOOK_LANGS="all" ASCIIDOC_CLI_TYPE=$(ASCIIDOC_CLI_TYPE) all-local $(generated_docs) $(ascii) + make DOCBOOK_FORMATS="pdf,html,html-single,epub" DOCBOOK_LANGS="all" ASCIIDOC_CLI_TYPE=$(ASCIIDOC_CLI_TYPE) all-local echo Uploading current $(PACKAGE_SERIES)-$(ASCIIDOC_CLI_TYPE) documentation set to clusterlabs.org - rsync -rtz --progress $(generated_docs) $(ascii) $(asciiman) root@www.clusterlabs.org:/var/www/html/doc/ if BUILD_DOCBOOK for book in $(docbook); do \ echo Uploading $$book...; \ echo "Requires Corosync 2.x and optimized for the $(ASCIIDOC_CLI_TYPE) CLI
" > $$book/publish/build-$(PACKAGE_SERIES)-$(ASCIIDOC_CLI_TYPE).txt; \ echo "Generated on `date` from version: $(shell git log --pretty="format:%h %d" -n 1)" >> $$book/publish/build-$(PACKAGE_SERIES)-$(ASCIIDOC_CLI_TYPE).txt; \ for lang in `ls -1 $$book/publish | grep [a-z][a-z]-[A-Z][A-Z]`; do \ mv $$book/publish/$$lang/Pacemaker/$(PACKAGE_SERIES)-$(ASCIIDOC_CLI_TYPE)/epub/$$book/Pacemaker-1.1{-$(ASCIIDOC_CLI_TYPE),}-$$book-$$lang.epub; \ mv $$book/publish/$$lang/Pacemaker/$(PACKAGE_SERIES)-$(ASCIIDOC_CLI_TYPE)/pdf/$$book/Pacemaker-1.1{-$(ASCIIDOC_CLI_TYPE),}-$$book-$$lang.pdf; \ done; \ rsync -rtz --progress $$book/publish/* root@www.clusterlabs.org:/var/www/html/doc/; \ sed -i.sed 's@version:.*@version: $(PACKAGE_SERIES)@' $$book/publican.cfg; \ done endif clean-local: -rm -rf $(generated_docs) $(generated_mans) $(docbook_build) $(CFS_XML) $(PE_XML) for book in $(docbook); do rm -rf $$book/tmp $$book/publish; done foo: rm -f $(CFS_XML) diff --git a/doc/Pacemaker_Explained/en-US/Ap-OCF.txt b/doc/Pacemaker_Explained/en-US/Ap-OCF.txt index 72524483d0..4edccdd5c7 100644 --- a/doc/Pacemaker_Explained/en-US/Ap-OCF.txt +++ b/doc/Pacemaker_Explained/en-US/Ap-OCF.txt @@ -1,256 +1,256 @@ [appendix] [[ap-ocf]] == More About OCF Resource Agents == === Location of Custom Scripts === indexterm:[OCF Resource Agents] OCF Resource Agents are found in '/usr/lib/ocf/resource.d/+provider+'. When creating your own agents, you are encouraged to create a new directory under _/usr/lib/ocf/resource.d/_ so that they are not confused with (or overwritten by) the agents shipped with Heartbeat. So, for example, if you chose the provider name of bigCorp and wanted a new resource named bigApp, you would create a script called _/usr/lib/ocf/resource.d/bigCorp/bigApp_ and define a resource: [source,XML] === Actions === All OCF Resource Agents are required to implement the following actions .Required Actions for OCF Agents [width="95%",cols="3m,3,7",options="header",align="center"] |========================================================= |Action |Description |Instructions |start |Start the resource |Return 0 on success and an appropriate error code otherwise. Must not report success until the resource is fully active. indexterm:[start,OCF Action] indexterm:[OCF,Action,start] |stop |Stop the resource |Return 0 on success and an appropriate error code otherwise. Must not report success until the resource is fully stopped. indexterm:[stop,OCF Action] indexterm:[OCF,Action,stop] |monitor |Check the resource's state |Exit 0 if the resource is running, 7 if it is stopped, and anything else if it is failed. indexterm:[monitor,OCF Action] indexterm:[OCF,Action,monitor] NOTE: The monitor script should test the state of the resource on the local machine only. |meta-data |Describe the resource |Provide information about this resource as an XML snippet. Exit with 0. indexterm:[meta-data,OCF Action] indexterm:[OCF,Action,meta-data] NOTE: This is *not* performed as root. |validate-all |Verify the supplied parameters |Exit with 0 if parameters are valid, 2 if not valid, 6 if resource is not configured. indexterm:[validate-all,OCF Action] indexterm:[OCF,Action,validate-all] |========================================================= Additional requirements (not part of the OCF specs) are placed on agents that will be used for advanced concepts like <> and <> resources. 
.Optional Actions for OCF Agents [width="95%",cols="2m,6,3",options="header",align="center"] |========================================================= |Action |Description |Instructions |promote |Promote the local instance of a multi-state resource to the master/primary state. |Return 0 on success indexterm:[promote,OCF Action] indexterm:[OCF,Action,promote] |demote |Demote the local instance of a multi-state resource to the slave/secondary state. |Return 0 on success indexterm:[demote,OCF Action] indexterm:[OCF,Action,demote] |notify |Used by the cluster to send the agent pre and post notification events telling the resource what has happened and will happen. |Must not fail. Must exit with 0 indexterm:[notify,OCF Action] indexterm:[OCF,Action,notify] |========================================================= One action specified in the OCF specs is not currently used by the cluster: * +recover+ - a variant of the +start+ action, this should try to recover a resource locally. Remember to use indexterm:[ocf-tester]`ocf-tester` to verify that your new agent complies with the OCF standard properly. === How are OCF Return Codes Interpreted? === The first thing the cluster does is to check the return code against the expected result. If the result does not match the expected value, then the operation is considered to have failed and recovery action is initiated. There are three types of failure recovery: .Types of recovery performed by the cluster [width="95%",cols="1m,4,4",options="header",align="center"] |========================================================= |Type |Description |Action Taken by the Cluster |soft -indexterm:[soft,OCF error] -indexterm:[OCF,error,soft] |A transient error occurred |Restart the resource or move it to a new location +indexterm:[soft,OCF error] +indexterm:[OCF,error,soft] |hard -indexterm:[hard,OCF error] -indexterm:[OCF,error,hard] |A non-transient error that may be specific to the current node occurred |Move the resource elsewhere and prevent it from being retried on the current node +indexterm:[hard,OCF error] +indexterm:[OCF,error,hard] |fatal -indexterm:[fatal,OCF error] -indexterm:[OCF,error,fatal] |A non-transient error that will be common to all cluster nodes (eg. a bad configuration was specified) |Stop the resource and prevent it from being started on any cluster node +indexterm:[fatal,OCF error] +indexterm:[OCF,error,fatal] |========================================================= Assuming an action is considered to have failed, the following table outlines the different OCF return codes and the type of recovery the cluster will initiate when it is received. [[s-ocf-return-codes]] === OCF Return Codes === .OCF Return Codes and their Recovery Types [width="95%",cols="2m,5^m,6<,1m",options="header",align="center"] |========================================================= |RC |OCF Alias |Description |RT |0 |OCF_SUCCESS |Success. The command completed successfully. This is the expected result for all start, stop, promote and demote commands. indexterm:[Return Code,OCF_SUCCESS] indexterm:[Return Code,0,OCF_SUCCESS] |soft |1 |OCF_ERR_GENERIC |Generic "there was a problem" error code. indexterm:[Return Code,OCF_ERR_GENERIC] indexterm:[Return Code,1,OCF_ERR_GENERIC] |soft |2 |OCF_ERR_ARGS |The resource's configuration is not valid on this machine. Eg. refers to a location/tool not found on the node. indexterm:[Return Code,OCF_ERR_ARGS] indexterm:[Return Code,2,OCF_ERR_ARGS] |hard |3 |OCF_ERR_UNIMPLEMENTED |The requested action is not implemented. 
indexterm:[Return Code,OCF_ERR_UNIMPLEMENTED] indexterm:[Return Code,3,OCF_ERR_UNIMPLEMENTED] |hard |4 |OCF_ERR_PERM |The resource agent does not have sufficient privileges to complete the task. indexterm:[Return Code,OCF_ERR_PERM] indexterm:[Return Code,4,OCF_ERR_PERM] |hard |5 |OCF_ERR_INSTALLED |The tools required by the resource are not installed on this machine. indexterm:[Return Code,OCF_ERR_INSTALLED] indexterm:[Return Code,5,OCF_ERR_INSTALLED] |hard |6 |OCF_ERR_CONFIGURED |The resource's configuration is invalid. Eg. required parameters are missing. indexterm:[Return Code,OCF_ERR_CONFIGURED] indexterm:[Return Code,6,OCF_ERR_CONFIGURED] |fatal |7 |OCF_NOT_RUNNING |The resource is safely stopped. The cluster will not attempt to stop a resource that returns this for any action. indexterm:[Return Code,OCF_NOT_RUNNING] indexterm:[Return Code,7,OCF_NOT_RUNNING] |N/A |8 |OCF_RUNNING_MASTER |The resource is running in +Master+ mode. indexterm:[Return Code,OCF_RUNNING_MASTER] indexterm:[Return Code,8,OCF_RUNNING_MASTER] |soft |9 |OCF_FAILED_MASTER |The resource is in +Master+ mode but has failed. The resource will be demoted, stopped and then started (and possibly promoted) again. indexterm:[Return Code,OCF_FAILED_MASTER] indexterm:[Return Code,9,OCF_FAILED_MASTER] |soft |other |NA |Custom error code. indexterm:[Return Code,other] |soft |========================================================= Although counterintuitive, even actions that return 0 (aka. +OCF_SUCCESS+) can be considered to have failed. === Exceptions === * Non-recurring monitor actions (probes) that find a resource active (or in Master mode) will not result in recovery action unless it is also found active elsewhere * The recovery action taken when a resource is found active more than once is determined by the _multiple-active_ property of the resource * Recurring actions that return +OCF_ERR_UNIMPLEMENTED+ do not cause any type of recovery diff --git a/doc/Pacemaker_Explained/en-US/Ch-Basics.txt b/doc/Pacemaker_Explained/en-US/Ch-Basics.txt index 57c0167424..309ce9b295 100644 --- a/doc/Pacemaker_Explained/en-US/Ch-Basics.txt +++ b/doc/Pacemaker_Explained/en-US/Ch-Basics.txt @@ -1,368 +1,368 @@ = Configuration Basics = == Configuration Layout == The cluster configuration is written using XML notation and divided into two main sections: configuration and status. The status section contains the history of each resource on each node and, based on this data, the cluster can construct the complete current state of the cluster. The authoritative source for the status section is the local resource manager (lrmd) process on each cluster node and the cluster will occasionally repopulate the entire section. For this reason it is never written to disk and administrators are advised against modifying it in any way. The configuration section contains the more traditional information like cluster options, lists of resources and indications of where they should be placed. The configuration section is the primary focus of this document. The configuration section itself is divided into four parts: * Configuration options (called +crm_config+) * Nodes * Resources * Resource relationships (called +constraints+) .An empty configuration ====== [source,XML] ------- <cib admin_epoch="0" epoch="0" num_updates="0" have-quorum="false"> <configuration> <crm_config/> <nodes/> <resources/> <constraints/> </configuration> <status/> </cib> ------- ====== == The Current State of the Cluster == Before one starts to configure a cluster, it is worth explaining how to view the finished product. For this purpose we have created the `crm_mon` utility that will display the current state of an active cluster.
It can show the cluster status by node or by resource and can be used in either single-shot or dynamically-updating mode. There are also modes for displaying a list of the operations performed (grouped by node and resource) as well as information about failures. Using this tool, you can examine the state of the cluster for irregularities and see how it responds when you cause or simulate failures. Details on all the available options can be obtained using the `crm_mon --help` command. .Sample output from crm_mon ====== ------- ============ Last updated: Fri Nov 23 15:26:13 2007 Current DC: sles-3 (2298606a-6a8c-499a-9d25-76242f7006ec) 3 Nodes configured. 5 Resources configured. ============ Node: sles-1 (1186dc9a-324d-425a-966e-d757e693dc86): online 192.168.100.181 (heartbeat::ocf:IPaddr): Started sles-1 192.168.100.182 (heartbeat:IPaddr): Started sles-1 192.168.100.183 (heartbeat::ocf:IPaddr): Started sles-1 rsc_sles-1 (heartbeat::ocf:IPaddr): Started sles-1 child_DoFencing:2 (stonith:external/vmware): Started sles-1 Node: sles-2 (02fb99a8-e30e-482f-b3ad-0fb3ce27d088): standby Node: sles-3 (2298606a-6a8c-499a-9d25-76242f7006ec): online rsc_sles-2 (heartbeat::ocf:IPaddr): Started sles-3 rsc_sles-3 (heartbeat::ocf:IPaddr): Started sles-3 child_DoFencing:0 (stonith:external/vmware): Started sles-3 ------- ====== .Sample output from crm_mon -n ====== ------- ============ Last updated: Fri Nov 23 15:26:13 2007 Current DC: sles-3 (2298606a-6a8c-499a-9d25-76242f7006ec) 3 Nodes configured. 5 Resources configured. ============ Node: sles-1 (1186dc9a-324d-425a-966e-d757e693dc86): online Node: sles-2 (02fb99a8-e30e-482f-b3ad-0fb3ce27d088): standby Node: sles-3 (2298606a-6a8c-499a-9d25-76242f7006ec): online Resource Group: group-1 192.168.100.181 (heartbeat::ocf:IPaddr): Started sles-1 192.168.100.182 (heartbeat:IPaddr): Started sles-1 192.168.100.183 (heartbeat::ocf:IPaddr): Started sles-1 rsc_sles-1 (heartbeat::ocf:IPaddr): Started sles-1 rsc_sles-2 (heartbeat::ocf:IPaddr): Started sles-3 rsc_sles-3 (heartbeat::ocf:IPaddr): Started sles-3 Clone Set: DoFencing child_DoFencing:0 (stonith:external/vmware): Started sles-3 child_DoFencing:1 (stonith:external/vmware): Stopped child_DoFencing:2 (stonith:external/vmware): Started sles-1 ------- ====== The DC (Designated Controller) node is where all the decisions are made and if the current DC fails a new one is elected from the remaining cluster nodes. The choice of DC is of no significance to an administrator beyond the fact that its logs will generally be more interesting. == How Should the Configuration be Updated? == There are three basic rules for updating the cluster configuration: * Rule 1 - Never edit the cib.xml file manually. Ever. I'm not making this up. * Rule 2 - Read Rule 1 again. * Rule 3 - The cluster will notice if you ignored rules 1 & 2 and refuse to use the configuration. Now that it is clear how NOT to update the configuration, we can begin to explain how you should. The most powerful tool for modifying the configuration is the +cibadmin+ command which talks to a running cluster. With +cibadmin+, the user can query, add, remove, update or replace any part of the configuration; all changes take effect immediately, so there is no need to perform a reload-like operation. The simplest way of using cibadmin is to use it to save the current configuration to a temporary file, edit that file with your favorite text or XML editor and then upload the revised configuration. 
.Safely using an editor to modify the cluster configuration ====== [source,C] -------- # cibadmin --query > tmp.xml # vi tmp.xml # cibadmin --replace --xml-file tmp.xml -------- ====== Some of the better XML editors can make use of a Relax NG schema to help make sure any changes you make are valid. The schema describing the configuration can normally be found in '/usr/lib/heartbeat/pacemaker.rng' on most systems. If you only wanted to modify the resources section, you could instead do .Safely using an editor to modify a subsection of the cluster configuration ====== [source,C] -------- # cibadmin --query --obj_type resources > tmp.xml # vi tmp.xml # cibadmin --replace --obj_type resources --xml-file tmp.xml -------- ====== to avoid modifying any other part of the configuration. == Quickly Deleting Part of the Configuration == Identify the object you wish to delete. Eg. run .Searching for STONITH related configuration items ====== [source,C] # cibadmin -Q | grep stonith [source,XML] -------- -------- ====== Next identify the resource's tag name and id (in this case we'll choose +primitive+ and +child_DoFencing+). Then simply execute: [source,C] -# cibadmin --delete --crm_xml '<primitive id="child_DoFencing"/>' +# cibadmin --delete --crm_xml '<primitive id="child_DoFencing"/>' == Updating the Configuration Without Using XML == Some common tasks can also be performed with one of the higher level tools that avoid the need to read or edit XML. To enable stonith, for example, one could run: [source,C] # crm_attribute --attr-name stonith-enabled --attr-value true Or, to see if +somenode+ is allowed to run resources, there is: [source,C] # crm_standby --get-value --node-uname somenode Or, to find the current location of +my-test-rsc+, one can use: [source,C] # crm_resource --locate --resource my-test-rsc [[s-config-sandboxes]] == Making Configuration Changes in a Sandbox == Often it is desirable to preview the effects of a series of changes before updating the configuration atomically. For this purpose we have created `crm_shadow` which creates a "shadow" copy of the configuration and arranges for all the command line tools to use it. To begin, simply invoke `crm_shadow` and give it the name of a configuration to create footnote:[Shadow copies are identified with a name, making it possible to have more than one.]; be sure to follow the simple on-screen instructions. WARNING: Read the above carefully; failure to do so could result in you destroying the cluster's active configuration! .Creating and displaying the active sandbox ====== [source,Bash] -------- # crm_shadow --create test Setting up shadow instance Type Ctrl-D to exit the crm_shadow shell shadow[test]: shadow[test] # crm_shadow --which test -------- ====== From this point on, all cluster commands will automatically use the shadow copy instead of talking to the cluster's active configuration. Once you have finished experimenting, you can either commit the changes, or discard them as shown below. Again, be sure to follow the on-screen instructions carefully. For a full list of `crm_shadow` options and commands, invoke it with the --help option.
.Using a sandbox to make multiple changes atomically ====== [source,Bash] -------- shadow[test] # crm_failcount -G -r rsc_c001n01 name=fail-count-rsc_c001n01 value=0 shadow[test] # crm_standby -v on -n c001n02 shadow[test] # crm_standby -G -n c001n02 name=c001n02 scope=nodes value=on shadow[test] # cibadmin --erase --force shadow[test] # cibadmin --query shadow[test] # crm_shadow --delete test --force Now type Ctrl-D to exit the crm_shadow shell shadow[test] # exit # crm_shadow --which No shadow instance provided # cibadmin -Q -------- ====== The example above makes a series of changes in a sandbox and then verifies that the real configuration is untouched. [[s-config-testing-changes]] == Testing Your Configuration Changes == We saw previously how to make a series of changes to a "shadow" copy of the configuration. Before loading the changes back into the cluster (eg. `crm_shadow --commit mytest --force`), it is often advisable to simulate the effect of the changes with +crm_simulate+, eg. [source,C] # crm_simulate --live-check -VVVVV --save-graph tmp.graph --save-dotfile tmp.dot The tool uses the same library as the live cluster to show what it would have done given the supplied input. Its output, in addition to a significant amount of logging, is stored in two files +tmp.graph+ and +tmp.dot+; both are representations of the same thing -- the cluster's response to your changes. The graph file stores the complete transition, containing a list of all the actions, their parameters and their prerequisites. Because the transition graph is not terribly easy to read, the tool also generates a Graphviz dot-file representing the same information. == Interpreting the Graphviz output == * Arrows indicate ordering dependencies * Dashed arrows indicate dependencies that are not present in the transition graph * Actions with a dashed border of any color do not form part of the transition graph * Actions with a green border form part of the transition graph * Actions with a red border are ones the cluster would like to execute but cannot run * Actions with a blue border are ones the cluster does not feel need to be executed * Actions with orange text are pseudo/pretend actions that the cluster uses to simplify the graph * Actions with black text are sent to the LRM * Resource actions have text of the form pass:[rsc]_pass:[action]_pass:[interval] pass:[node] * Any action depending on an action with a red border will not be able to execute. * Loops are _really_ bad. Please report them to the development team. === Small Cluster Transition === image::images/Policy-Engine-small.png["An example transition graph as represented by Graphviz",width="16cm",height="6cm",align="center"] In the above example, it appears that a new node, +node2+, has come online and that the cluster is checking to make sure +rsc1+, +rsc2+ and +rsc3+ are not already running there (indicated by the +*_monitor_0+ entries). Once it did that, and assuming the resources were not active there, it would have liked to stop +rsc1+ and +rsc2+ on +node1+ and move them to +node2+. However, there appears to be some problem and the cluster cannot or is not permitted to perform the stop actions which implies it also cannot perform the start actions. For some reason the cluster does not want to start +rsc3+ anywhere. For information on the options supported by `crm_simulate`, use the `--help` option.
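If Graphviz is installed, the dot-file can also be rendered into an image for easier reading. For example (a sketch; the output format and file names here are arbitrary):

[source,C]
--------
# crm_simulate --live-check --save-dotfile tmp.dot
# dot -Tsvg tmp.dot -o tmp.svg    # render the transition graph with Graphviz
--------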
=== Complex Cluster Transition === image::images/Policy-Engine-big.png["Another, slightly more complex, transition graph that you're not expected to be able to read",width="16cm",height="20cm",align="center"] == Do I Need to Update the Configuration on all Cluster Nodes? == No. Any changes are immediately synchronized to the other active members of the cluster. To reduce bandwidth, the cluster only broadcasts the incremental updates that result from your changes and uses MD5 checksums to ensure that each copy is completely consistent. diff --git a/doc/Pacemaker_Explained/en-US/Ch-Options.txt b/doc/Pacemaker_Explained/en-US/Ch-Options.txt index d57c6cf15b..0f46bbdcdf 100644 --- a/doc/Pacemaker_Explained/en-US/Ch-Options.txt +++ b/doc/Pacemaker_Explained/en-US/Ch-Options.txt @@ -1,286 +1,286 @@ = Cluster Options = == Special Options == The reason for these fields to be placed at the top level instead of with the rest of cluster options is simply a matter of parsing. These options are used by the configuration database which is, by design, mostly ignorant of the content it holds. So the decision was made to place them in an easy to find location. == Configuration Version == indexterm:[Configuration Version,Cluster] indexterm:[Cluster,Option,Configuration Version] When a node joins the cluster, the cluster will perform a check to see who has the best configuration based on the fields below. It then asks the node with the highest (+admin_epoch+, +epoch+, +num_updates+) tuple to replace the configuration on all the nodes - which makes setting them, and setting them correctly, very important. .Configuration Version Properties [width="95%",cols="2m,5<",options="header",align="center"] |========================================================= |Field |Description | admin_epoch | indexterm:[admin_epoch,Cluster Option] indexterm:[Cluster,Option,admin_epoch] Never modified by the cluster. Use this to make the configurations on any inactive nodes obsolete. _Never set this value to zero_, in such cases the cluster cannot tell the difference between your configuration and the "empty" one used when nothing is found on disk. | epoch | indexterm:[epoch,Cluster Option] indexterm:[Cluster,Option,epoch] Incremented every time the configuration is updated (usually by the admin) | num_updates | indexterm:[num_updates,Cluster Option] indexterm:[Cluster,Option,num_updates] Incremented every time the configuration or status is updated (usually by the cluster) |========================================================= == Other Fields == .Properties Controlling Validation [width="95%",cols="2m,5<",options="header",align="center"] |========================================================= |Field |Description | validate-with | indexterm:[validate-with,Cluster Option] indexterm:[Cluster,Option,validate-with] Determines the type of validation being done on the configuration. If set to "none", the cluster will not verify that updates conform to the DTD (nor reject ones that don't). This option can be useful when operating a mixed version cluster during an upgrade. |========================================================= == Fields Maintained by the Cluster == .Properties Maintained by the Cluster [width="95%",cols="2m,5<",options="header",align="center"] |========================================================= |Field |Description |cib-last-written | indexterm:[cib-last-written,Cluster Property] indexterm:[Cluster,Property,cib-last-written] Indicates when the configuration was last written to disk. 
Informational purposes only. |dc-uuid | indexterm:[dc-uuid,Cluster Property] indexterm:[Cluster,Property,dc-uuid] Indicates which cluster node is the current leader. Used by the cluster when placing resources and determining the order of some events. |have-quorum | indexterm:[have-quorum,Cluster Property] indexterm:[Cluster,Property,have-quorum] Indicates if the cluster has quorum. If false, this may mean that the cluster cannot start resources or fence other nodes. See +no-quorum-policy+ below. |========================================================= Note that although these fields can be written to by the admin, in most cases the cluster will overwrite any values specified by the admin with the "correct" ones. To change the +admin_epoch+, for example, one would use: [source,C] -# cibadmin --modify --crm_xml ‘<cib admin_epoch="42"/>' +# cibadmin --modify --crm_xml '<cib admin_epoch="42"/>' A complete set of fields will look something like this: .An example of the fields set for a cib object ====== [source,XML] ------- <cib have-quorum="true" validate-with="pacemaker-1.0" admin_epoch="1" epoch="12" num_updates="65" dc-uuid="ea7d39f4-3b94-4cfa-ba7a-952956daabee"> ------- ====== == Cluster Options == Cluster options, as you might expect, control how the cluster behaves when confronted with certain situations. They are grouped into sets and, in advanced configurations, there may be more than one. footnote:[This will be described later in the section on <> where we will show how to have the cluster use different sets of options during working hours (when downtime is usually to be avoided at all costs) than it does during the weekends (when resources can be moved to their preferred hosts without bothering end users)] For now we will describe the simple case where each option is present at most once. == Available Cluster Options == .Cluster Options [width="95%",cols="5m,2,11<",options="header",align="center"] |========================================================= |Option |Default |Description | batch-limit | 30 | indexterm:[batch-limit,Cluster Option] indexterm:[Cluster,Option,batch-limit] The number of jobs that the TE is allowed to execute in parallel. The "correct" value will depend on the speed and load of your network and cluster nodes. | migration-limit | -1 (unlimited) | indexterm:[migration-limit,Cluster Option] indexterm:[Cluster,Option,migration-limit] The number of migration jobs that the TE is allowed to execute in parallel on a node. | no-quorum-policy | stop | indexterm:[no-quorum-policy,Cluster Option] indexterm:[Cluster,Option,no-quorum-policy] What to do when the cluster does not have quorum. Allowed values: * ignore - continue all resource management * freeze - continue resource management, but don't recover resources from nodes not in the affected partition * stop - stop all resources in the affected cluster partition * suicide - fence all nodes in the affected cluster partition | symmetric-cluster | TRUE | indexterm:[symmetric-cluster,Cluster Option] indexterm:[Cluster,Option,symmetric-cluster] Can all resources run on any node by default? | stonith-enabled | TRUE | indexterm:[stonith-enabled,Cluster Option] indexterm:[Cluster,Option,stonith-enabled] Should failed nodes and nodes with resources that can't be stopped be shot? If you value your data, set up a STONITH device and enable this. If true, or unset, the cluster will refuse to start resources unless one or more STONITH resources have also been configured. | stonith-action | reboot | indexterm:[stonith-action,Cluster Option] indexterm:[Cluster,Option,stonith-action] Action to send to STONITH device. Allowed values: reboot, off.
The value 'poweroff' is also allowed, but is only used for legacy devices. | cluster-delay | 60s | indexterm:[cluster-delay,Cluster Option] indexterm:[Cluster,Option,cluster-delay] Round trip delay over the network (excluding action execution). The "correct" value will depend on the speed and load of your network and cluster nodes. | stop-orphan-resources | TRUE | indexterm:[stop-orphan-resources,Cluster Option] indexterm:[Cluster,Option,stop-orphan-resources] Should deleted resources be stopped? | stop-orphan-actions | TRUE | indexterm:[stop-orphan-actions,Cluster Option] indexterm:[Cluster,Option,stop-orphan-actions] Should deleted actions be cancelled? | start-failure-is-fatal | TRUE | indexterm:[start-failure-is-fatal,Cluster Option] indexterm:[Cluster,Option,start-failure-is-fatal] When set to FALSE, the cluster will instead use the resource's +failcount+ and value for +resource-failure-stickiness+. | pe-error-series-max | -1 (all) | indexterm:[pe-error-series-max,Cluster Option] indexterm:[Cluster,Option,pe-error-series-max] The number of PE inputs resulting in ERRORs to save. Used when reporting problems. | pe-warn-series-max | -1 (all) | indexterm:[pe-warn-series-max,Cluster Option] indexterm:[Cluster,Option,pe-warn-series-max] The number of PE inputs resulting in WARNINGs to save. Used when reporting problems. | pe-input-series-max | -1 (all) | indexterm:[pe-input-series-max,Cluster Option] indexterm:[Cluster,Option,pe-input-series-max] The number of "normal" PE inputs to save. Used when reporting problems. |========================================================= You can always obtain an up-to-date list of cluster options, including their default values, by running the `pengine metadata` command. == Querying and Setting Cluster Options == indexterm:[Querying,Cluster Option] indexterm:[Setting,Cluster Option] indexterm:[Cluster,Querying Options] indexterm:[Cluster,Setting Options] Cluster options can be queried and modified using the `crm_attribute` tool. To get the current value of +cluster-delay+, simply use: [source,C] # crm_attribute --attr-name cluster-delay --get-value which is more simply written as [source,C] # crm_attribute --get-value -n cluster-delay If a value is found, you'll see a result like this: [source,C] # crm_attribute --get-value -n cluster-delay name=cluster-delay value=60s However, if no value is found, the tool will display an error: [source,C] # crm_attribute --get-value -n clusta-deway name=clusta-deway value=(null) Error performing operation: The object/attribute does not exist To use a different value, eg. +30s+, simply run: [source,C] # crm_attribute --attr-name cluster-delay --attr-value 30s To go back to the cluster's default value you can delete the value, for example with this command: [source,C] # crm_attribute --attr-name cluster-delay --delete-attr == When Options are Listed More Than Once == If you ever see something like the following, it means that the option you're modifying is present more than once. .Deleting an option that is listed twice ======= [source,C] ------- # crm_attribute --attr-name batch-limit --delete-attr Multiple attributes match name=batch-limit in crm_config: Value: 50 (set=cib-bootstrap-options, id=cib-bootstrap-options-batch-limit) Value: 100 (set=custom, id=custom-batch-limit) Please choose from one of the matches above and supply the 'id' with --attr-id ------- ======= In such cases follow the on-screen instructions to perform the requested action.
To determine which value is currently being used by the cluster, please refer to <>. diff --git a/doc/Pacemaker_Explained/en-US/Ch-Rules.txt b/doc/Pacemaker_Explained/en-US/Ch-Rules.txt index 6d8bf3d0ab..4f80983915 100644 --- a/doc/Pacemaker_Explained/en-US/Ch-Rules.txt +++ b/doc/Pacemaker_Explained/en-US/Ch-Rules.txt @@ -1,556 +1,561 @@ = Rules = -[[ch-rules]] +//// +We prefer [[ch-rules]], but older versions of asciidoc don't deal well +with that construct for chapter headings +//// + +anchor:ch-rules[Chapter 8, Rules] indexterm:[Resource,Constraint,Rule] Rules can be used to make your configuration more dynamic. One common example is to set one value for +resource-stickiness+ during working hours, to prevent resources from being moved back to their most preferred location, and another on weekends when no-one is around to notice an outage. Another use of rules might be to assign machines to different processing groups (using a node attribute) based on time and to then use that attribute when creating location constraints. Each rule can contain a number of expressions, date-expressions and even other rules. The results of the expressions are combined based on the rule's +boolean-op+ field to determine if the rule ultimately evaluates to +true+ or +false+. What happens next depends on the context in which the rule is being used. .Properties of a Rule [width="95%",cols="2m,5<",options="header",align="center"] |========================================================= |Field |Description |role |Limits the rule to apply only when the resource is in that role. Allowed values: _Started_, +Slave+, and +Master+. NOTE: A rule with +role="Master"+ cannot determine the initial location of a clone instance. It will only affect which of the active instances will be promoted. indexterm:[role,Constraint Rule] indexterm:[Constraint,Rule,role] |score |The score to apply if the rule evaluates to +true+. Limited to use in rules that are part of location constraints. indexterm:[score,Constraint Rule] indexterm:[Constraint,Rule,score] |score-attribute |The node attribute to look up and use as a score if the rule evaluates to +true+. Limited to use in rules that are part of location constraints. indexterm:[score-attribute,Constraint Rule] indexterm:[Constraint,Rule,score-attribute] |boolean-op |How to combine the result of multiple expression objects. Allowed values: _and_ and +or+. indexterm:[boolean-op,Constraint Rule] indexterm:[Constraint,Rule,boolean-op] |========================================================= == Node Attribute Expressions == indexterm:[Resource,Constraint,Attribute Expression] Expression objects are used to control a resource based on the attributes defined by a node or nodes. In addition to any attributes added by the administrator, each node has a built-in node attribute called +#uname+ that can also be used. .Properties of an Expression [width="95%",cols="2m,5<",options="header",align="center"] |========================================================= |Field |Description |value |User supplied value for comparison indexterm:[value,Constraint Expression] indexterm:[Constraint,Attribute Expression,value] |attribute |The node attribute to test indexterm:[attribute,Constraint Expression] indexterm:[Constraint,Attribute Expression,attribute] |type |Determines how the value(s) should be tested. Allowed values: _string_, +integer+, +version+ indexterm:[type,Constraint Expression] indexterm:[Constraint,Attribute Expression,type] |operation |The comparison to perform.
Allowed values: * 'lt' - True if the node attribute's value is less than +value+ * 'gt' - True if the node attribute's value is greater than +value+ * 'lte' - True if the node attribute's value is less than or equal to +value+ * 'gte' - True if the node attribute's value is greater than or equal to +value+ * 'eq' - True if the node attribute's value is equal to +value+ * 'ne' - True if the node attribute's value is not equal to +value+ * 'defined' - True if the node has the named attribute * 'not_defined' - True if the node does not have the named attribute indexterm:[operation,Constraint Expression] indexterm:[Constraint,Attribute Expression,operation] |========================================================= == Time/Date Based Expressions == indexterm:[Time Based Expressions] indexterm:[Resource,Constraint,Date/Time Expression] As the name suggests, +date_expressions+ are used to control a resource or cluster option based on the current date/time. They can contain an optional +date_spec+ and/or +duration+ object depending on the context. .Properties of a Date Expression [width="95%",cols="2m,5<",options="header",align="center"] |========================================================= |Field |Description |start |A date/time conforming to the ISO8601 specification. indexterm:[start,Constraint Expression] indexterm:[Constraint,Date/Time Expression,start] |end |A date/time conforming to the ISO8601 specification. Can be inferred by supplying a value for +start+ and a +duration+. indexterm:[end,Constraint Expression] indexterm:[Constraint,Date/Time Expression,end] |operation |Compares the current date/time with the start and/or end date, depending on the context. Allowed values: * 'gt' - True if the current date/time is after +start+ * 'lt' - True if the current date/time is before +end+ * 'in-range' - True if the current date/time is after +start+ and before +end+ * 'date-spec' - performs a cron-like comparison to the current date/time indexterm:[operation,Constraint Expression] indexterm:[Constraint,Date/Time Expression,operation] |========================================================= [NOTE] ====== As these comparisons (except for +date_spec+) include the time, the +eq+, +neq+, +gte+ and +lte+ operators have not been implemented since they would only be valid for a single second. ====== === Date Specifications === indexterm:[Date Specification] indexterm:[Resource,Constraint,Date Specification] +date_spec+ objects are used to create cron-like expressions relating to time. Each field can contain a single number or a single range. Instead of defaulting to zero, any field not supplied is ignored. For example, +monthdays="1"+ matches the first day of every month and +hours="09-17"+ matches the hours between 9am and 5pm (inclusive). However, at this time one cannot specify +weekdays="1,2"+ or +weekdays="1-2,5-6"+ since they contain multiple ranges. Depending on demand, this may be implemented in a future release. 
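For instance (a sketch using the fields described below; the +id+ values are illustrative), a +date_spec+ matching Monday to Friday, 9am to 5pm, could be written as:

[source,XML]
-------
<rule id="rule-work-hours">
  <date_expression id="date-work-hours" operation="date_spec">
    <date_spec id="spec-work-hours" hours="9-16" weekdays="1-5"/>
  </date_expression>
</rule>
-------

Note that, as explained in the samples later in this chapter, +hours="9-16"+ matches up to 16:59:59, since the numeric value (hour) still matches.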
.Properties of a Date Spec [width="95%",cols="2m,5<",options="header",align="center"] |========================================================= |Field |Description |id |A unique name for the date indexterm:[id,Date Specification] indexterm:[Constraint,Date Specification,id] |hours |Allowed values: 0-23 indexterm:[hours,Date Specification] indexterm:[Constraint,Date Specification,hours] |monthdays |Allowed values: 0-31 (depending on month and year) indexterm:[monthdays,Date Specification] indexterm:[Constraint,Date Specification,monthdays] |weekdays |Allowed values: 1-7 (1=Monday, 7=Sunday) indexterm:[weekdays,Date Specification] indexterm:[Constraint,Date Specification,weekdays] |yeardays |Allowed values: 1-366 (depending on the year) indexterm:[yeardays,Date Specification] indexterm:[Constraint,Date Specification,yeardays] |months |Allowed values: 1-12 indexterm:[months,Date Specification] indexterm:[Constraint,Date Specification,months] |weeks |Allowed values: 1-53 (depending on weekyear) indexterm:[weeks,Date Specification] indexterm:[Constraint,Date Specification,weeks] |years |Year according the Gregorian calendar indexterm:[years,Date Specification] indexterm:[Constraint,Date Specification,years] |weekyears |May differ from Gregorian years; Eg. +2005-001 Ordinal+ is also +2005-01-01 Gregorian+ is also +2004-W53-6 Weekly+ indexterm:[weekyears,Date Specification] indexterm:[Constraint,Date Specification,weekyears] |moon |Allowed values: 0-7 (0 is new, 4 is full moon). Seriously, you can use this. This was implemented to demonstrate the ease with which new comparisons could be added. indexterm:[moon,Date Specification] indexterm:[Constraint,Date Specification,moon] |========================================================= === Durations === indexterm:[Duration] indexterm:[Resource,Constraint,Duration] Durations are used to calculate a value for +end+ when one is not supplied to in_range operations. They contain the same fields as +date_spec+ objects but without the limitations (ie. you can have a duration of 19 months). Like +date_specs+, any field not supplied is ignored. == Sample Time Based Expressions == A small sample of how time based expressions can be used. //// On older versions of asciidoc, the [source] directive makes the title dissappear //// .True if now is any time in the year 2005 ==== [source,XML] ---- ---- ==== .Equivalent expression ==== [source,XML] ---- ---- ==== .9am-5pm, Mon-Friday ==== [source,XML] ------- ------- ==== Please note that the +16+ matches up to +16:59:59+, as the numeric value (hour) still matches! .9am-6pm, Mon-Friday, or all day saturday ==== [source,XML] ------- ------- ==== .9am-5pm or 9pm-12pm, Mon-Friday ==== [source,XML] ------- ------- ==== .Mondays in March 2005 ==== [source,XML] ------- ------- ==== [NOTE] ====== Because no time is specified, 00:00:00 is implied. This means that the range includes all of 2005-03-01 but none of 2005-04-01. You may wish to write +end="2005-03-31T23:59:59"+ to avoid confusion. ====== .A full moon on Friday the 13th ===== [source,XML] ------- ------- ===== == Using Rules to Determine Resource Location == indexterm:[Rule,Determine Resource Location] indexterm:[Resource,Location,Determine by Rules] If the constraint's outer-most rule evaluates to +false+, the cluster treats the constraint as if it was not there. When the rule evaluates to +true+, the node's preference for running the resource is updated with the score associated with the rule. 
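As a sketch (the resource and node names are taken from the +myApacheRsc+ example that follows; the +id+ values are illustrative), a location constraint carrying such a rule could look like:

[source,XML]
-------
<rsc_location id="ban-myApacheRsc-on-c001n03" rsc="myApacheRsc">
  <rule id="ban-myApacheRsc-rule" score="-INFINITY">
    <expression id="ban-myApacheRsc-expr" attribute="#uname"
                operation="eq" value="c001n03"/>
  </rule>
</rsc_location>
-------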
If this sounds familiar, its because you have been using a simplified syntax for location constraint rules already. Consider the following location constraint: .Prevent myApacheRsc from running on c001n03 ===== [source,XML] ------- ------- ===== This constraint can be more verbosely written as: .Prevent myApacheRsc from running on c001n03 - expanded version ===== [source,XML] ------- ------- ===== The advantage of using the expanded form is that one can then add extra clauses to the rule, such as limiting the rule such that it only applies during certain times of the day or days of the week (this is discussed in subsequent sections). It also allows us to match on node properties other than its name. If we rated each machine's CPU power such that the cluster had the following nodes section: .A sample nodes section for use with score-attribute ===== [source,XML] ------- ------- ===== then we could prevent resources from running on underpowered machines with the rule [source,XML] ------- ------- === Using +score-attribute+ Instead of +score+ === When using +score-attribute+ instead of +score+, each node matched by the rule has its score adjusted differently, according to its value for the named node attribute. Thus, in the previous example, if a rule used +score-attribute="cpu_mips"+, +c001n01+ would have its preference to run the resource increased by +1234+ whereas +c001n02+ would have its preference increased by +5678+. == Using Rules to Control Resource Options == Often some cluster nodes will be different from their peers; sometimes these differences (the location of a binary or the names of network interfaces) require resources to be configured differently depending on the machine they're hosted on. By defining multiple +instance_attributes+ objects for the resource and adding a rule to each, we can easily handle these special cases. In the example below, +mySpecialRsc+ will use eth1 and port 9999 when run on +node1+, eth2 and port 8888 on +node2+ and default to eth0 and port 9999 for all other nodes. .Defining different resource options based on the node name ===== [source,XML] ------- ------- ===== The order in which +instance_attributes+ objects are evaluated is determined by their score (highest to lowest). If not supplied, score defaults to zero and objects with an equal score are processed in listed order. If the +instance_attributes+ object does not have a +rule+ or has a +rule+ that evaluates to +true+, then for any parameter the resource does not yet have a value for, the resource will use the parameter values defined by the +instance_attributes+ object. == Using Rules to Control Cluster Options == indexterm:[Rule,Controlling Cluster Options] indexterm:[Cluster,Setting Options with Rules] Controlling cluster options is achieved in much the same manner as specifying different resource options on different nodes. The difference is that because they are cluster options, one cannot (or should not, because they won't work) use attribute based expressions. The following example illustrates how to set a different +resource-stickiness+ value during and outside of work hours. This allows resources to automatically move back to their most preferred hosts, but at a time that (in theory) does not interfere with business activities. .Change +resource-stickiness+ during working hours ===== [source,XML] ------- ------- ===== [[s-rules-recheck]] == Ensuring Time Based Rules Take Effect == A Pacemaker cluster is an event driven system. 
As such, it won't recalculate the best place for resources to run in unless something (like a resource failure or configuration change) happens. This can mean that a location constraint that only allows resource X to run between 9am and 5pm is not enforced. If you rely on time based rules, it is essential that you set the +cluster-recheck-interval+ option. This tells the cluster to periodically recalculate the ideal state of the cluster. For example, if you set +cluster-recheck-interval=5m+, then sometime between 9:00 and 9:05 the cluster would notice that it needs to start resource X, and between 17:00 and 17:05 it would realize that X needed to be stopped. Note that the timing of the actual start and stop actions depends on what else needs to be performed first. diff --git a/doc/Pacemaker_Explained/en-US/Ch-Stonith.txt b/doc/Pacemaker_Explained/en-US/Ch-Stonith.txt index 4c831db8a7..1df1b9fe29 100644 --- a/doc/Pacemaker_Explained/en-US/Ch-Stonith.txt +++ b/doc/Pacemaker_Explained/en-US/Ch-Stonith.txt @@ -1,308 +1,314 @@ -[[ch-stonith]] = Configure STONITH = +//// +We prefer [[ch-stonith]], but older versions of asciidoc don't deal well +with that construct for chapter headings +//// +anchor:ch-stonith[Chapter 13, STONITH] +indexterm:[STONITH, Configuration] + == What Is STONITH == STONITH is an acronym for Shoot-The-Other-Node-In-The-Head and it protects your data from being corrupted by rogue nodes or concurrent access. Just because a node is unresponsive, this doesn't mean it isn't accessing your data. The only way to be 100% sure that your data is safe is to use STONITH so we can be certain that the node is truly offline, before allowing the data to be accessed from another node. STONITH also has a role to play in the event that a clustered service cannot be stopped. In this case, the cluster uses STONITH to force the whole node offline, thereby making it safe to start the service elsewhere. == What STONITH Device Should You Use == It is crucial that the STONITH device allows the cluster to differentiate between a node failure and a network one. The biggest mistake people make in choosing a STONITH device is to use a remote power switch (such as many on-board IPMI controllers) that shares power with the node it controls. In such cases, the cluster cannot be sure if the node is really offline, or active and suffering from a network fault. Likewise, any device that relies on the machine being active (such as SSH-based "devices" used during testing) is inappropriate. == Configuring STONITH == ifdef::pcs[] . Find the correct driver: +pcs stonith list+ . Find the parameters associated with the device: +pcs stonith describe + . Create a local config to make changes to +pcs cluster cib stonith_cfg+ . Create the fencing resource using +pcs -f stonith_cfg stonith create [stonith device options]+ . Set stonith-enabled to true. +pcs -f stonith_cfg property set stonith-enabled=true+ endif::pcs[] -ifdef::crm[] +ifdef::crmsh[] . Find the correct driver: +stonith_admin --list-installed+ . Since every device is different, the parameters needed to configure it will vary. To find out the parameters associated with the device, run: +stonith_admin --metadata --agent type+ The output should be XML formatted text containing additional parameter descriptions. We will endeavor to make the output more friendly in a later version. .
Enter the shell: +crm+ Create an editable copy of the existing configuration: +cib new stonith+ Create a fencing resource containing a primitive resource with a class of stonith, a type of _type_ and a parameter for each of the values returned in step 2: +configure primitive ...+ -endif::crm[] +endif::crmsh[] . If the device does not know how to fence nodes based on their uname, you may also need to set the special +pcmk_host_map+ parameter. See +man stonithd+ for details. . If the device does not support the list command, you may also need to set the special +pcmk_host_list+ and/or +pcmk_host_check+ parameters. See +man stonithd+ for details. . If the device does not expect the victim to be specified with the port parameter, you may also need to set the special +pcmk_host_argument+ parameter. See +man stonithd+ for details. -ifdef::crm[] +ifdef::crmsh[] . Upload it into the CIB from the shell: +cib commit stonith+ -endif::crm[] +endif::crmsh[] ifdef::pcs[] . Commit the new configuration. +pcs cluster push cib stonith_cfg+ endif::pcs[] . Once the stonith resource is running, you can test it by executing +stonith_admin --reboot nodename+, although you might want to stop the cluster on that machine first. == Example == Assuming we have a chassis containing four nodes and an IPMI device active on 10.0.0.1, then we would choose the fence_ipmilan driver in step 2 and obtain the following list of parameters .Obtaining a list of STONITH Parameters ifdef::pcs[] [source,Bash] ---- # pcs stonith describe fence_ipmilan Stonith options for: fence_ipmilan auth: IPMI Lan Auth type (md5, password, or none) ipaddr: IPMI Lan IP to talk to passwd: Password (if required) to control power on IPMI device passwd_script: Script to retrieve password (if required) lanplus: Use Lanplus login: Username/Login (if required) to control power on IPMI device action: Operation to perform. Valid operations: on, off, reboot, status, list, diag, monitor or metadata timeout: Timeout (sec) for IPMI operation cipher: Ciphersuite to use (same as ipmitool -C parameter) method: Method to fence (onoff or cycle) power_wait: Wait X seconds after on/off operation delay: Wait X seconds before fencing is started privlvl: Privilege level on IPMI device verbose: Verbose mode ---- endif::pcs[] -ifdef::crm[] [source,C] ---- # stonith_admin --metadata -a fence_ipmilan ---- [source,XML] ---- fence_ipmilan is an I/O Fencing agent which can be used with machines controlled by IPMI. This agent calls support software using ipmitool (http://ipmitool.sf.net/). To use fence_ipmilan with HP iLO 3 you have to enable lanplus option (lanplus / -P) and increase wait after operation to 4 seconds (power_wait=4 / -T 4) IPMI Lan Auth type (md5, password, or none) IPMI Lan IP to talk to Password (if required) to control power on IPMI device Script to retrieve password (if required) Use Lanplus Username/Login (if required) to control power on IPMI device Operation to perform.
Valid operations: on, off, reboot, status, list, diag, monitor or metadata Timeout (sec) for IPMI operation Ciphersuite to use (same as ipmitool -C parameter) Method to fence (onoff or cycle) Wait X seconds after on/off operation Wait X seconds before fencing is started Verbose mode ---- -endif::crm[] +endif::crmsh[] from which we would create a STONITH resource fragment that might look like this .Sample STONITH Resource ifdef::pcs[] [source,Bash] ---- # pcs cluster cib stonith_cfg # pcs -f stonith_cfg stonith create ipmi-fencing fence_ipmilan \ pcmk_host_list="pcmk-1 pcmk-2" ipaddr=10.0.0.1 login=testuser \ passwd=abc123 op monitor interval=60s # pcs -f stonith_cfg stonith ipmi-fencing (stonith:fence_ipmilan) Stopped ---- endif::pcs[] -ifdef::crm[] +ifdef::crmsh[] [source,Bash] ---- # crm crm(live)# cib new stonith INFO: stonith shadow CIB created crm(stonith)# configure primitive ipmi-fencing stonith::fence_ipmilan \ params pcmk_host_list="pcmk-1 pcmk-2" ipaddr=10.0.0.1 login=testuser passwd=abc123 \ op monitor interval="60s" ---- -endif::crm[] +endif::crmsh[] And finally, since we disabled it earlier, we need to re-enable STONITH. At this point we should have the following configuration. ifdef::pcs[] [source,Bash] ---- # pcs -f stonith_cfg property set stonith-enabled=true # pcs -f stonith_cfg property dc-version: 1.1.8-1.el7-60a19ed12fdb4d5c6a6b6767f52e5391e447fec0 cluster-infrastructure: corosync no-quorum-policy: ignore stonith-enabled: true ---- endif::pcs[] Now push the configuration into the cluster. ifdef::pcs[] [source,C] ---- # pcs cluster push cib stonith_cfg ---- endif::pcs[] -ifdef::crm[] +ifdef::crmsh[] [source,Bash] ---- crm(stonith)# configure property stonith-enabled="true" crm(stonith)# configure show node pcmk-1 node pcmk-2 primitive WebData ocf:linbit:drbd \ params drbd_resource="wwwdata" \ op monitor interval="60s" primitive WebFS ocf:heartbeat:Filesystem \ params device="/dev/drbd/by-res/wwwdata" directory="/var/www/html" fstype="gfs2" primitive WebSite ocf:heartbeat:apache \ params configfile="/etc/httpd/conf/httpd.conf" \ op monitor interval="1min" primitive ClusterIP ocf:heartbeat:IPaddr2 \ params ip="192.168.122.101" cidr_netmask="32" clusterip_hash="sourceip" \ op monitor interval="30s" primitive ipmi-fencing stonith::fence_ipmilan \ params pcmk_host_list="pcmk-1 pcmk-2" ipaddr=10.0.0.1 login=testuser passwd=abc123 \ op monitor interval="60s" ms WebDataClone WebData \ meta master-max="2" master-node-max="1" clone-max="2" clone-node-max="1" notify="true" clone WebFSClone WebFS clone WebIP ClusterIP \ meta globally-unique="true" clone-max="2" clone-node-max="2" clone WebSiteClone WebSite colocation WebSite-with-WebFS inf: WebSiteClone WebFSClone colocation fs_on_drbd inf: WebFSClone WebDataClone:Master colocation website-with-ip inf: WebSiteClone WebIP order WebFS-after-WebData inf: WebDataClone:promote WebFSClone:start order WebSite-after-WebFS inf: WebFSClone WebSiteClone order apache-after-ip inf: WebIP WebSiteClone property $id="cib-bootstrap-options" \ dc-version="1.1.5-bdd89e69ba545404d02445be1f3d72e6a203ba2f" \ cluster-infrastructure="openais" \ expected-quorum-votes="2" \ stonith-enabled="true" \ no-quorum-policy="ignore" rsc_defaults $id="rsc-options" \ resource-stickiness="100" crm(stonith)# cib commit stonith INFO: commited 'stonith' shadow CIB to the cluster crm(stonith)# quit bye ---- -endif::crm[] +endif::crmsh[] diff --git a/doc/coding_guidelines.txt b/doc/coding_guidelines.txt new file mode 100644 index 0000000000..079d7c73b0 --- /dev/null +++
b/doc/coding_guidelines.txt @@ -0,0 +1,130 @@ += Pacemaker Coding Guidelines = + +== Table of Contents == + +1. Introduction +2. Formatting Guidelines +3. Naming Conventions +4. vim Settings + +== Introduction == + +The purpose of this document is to discuss guidelines about how to write +code that will be a part of the Pacemaker project. + +== Formatting Guidelines == + +=== Whitespace === + +- Indentation must be 4 spaces, no tabs. +- Do not leave trailing whitespace. + +=== Line Length === + +- Lines should be no longer than 80 characters unless limiting line length significantly impacts readability. + +=== Pointers === + +- The '*' goes by the variable name, not the type: + +``` + char *foo; +``` + +- Use a space before the '*' and after the closing parenthesis in a cast: + +``` + char *foo = (char *) bar; +``` + +=== Functions === + +- Put the return type on its own line: +- Place the opening brace for a function on the next line: + +``` + static int + foo(void) + { +``` + +- For functions with enough arguments that they must break to the next line, align arguments with the first argument: + +``` + static int + function_name(int bar, const char *a, const char *b, + const char *c, const char *d) + { +``` + +- If a function name gets really long, start the arguments on their own line with 8 spaces of indentation: + +``` + static int + really_really_long_function_name_this_is_getting_silly_now( + int bar, const char *a, const char *b, + const char *c, const char *d) + { +``` + +=== Control statements (if, else, while, for, switch) === + +- Keyword is followed by one space, then left parenthesis without space, + condition, right parenthesis, space, opening bracket on the same line. + +- "else" and "else if" are on the same line with ending brace and opening + brace, separated by a space + +``` + if (condition1) { + statement1; + } else if (condition2) { + statement2; + } else { + statement3; + } +``` + +- Cases in switch statement have same indentation as switch. Body of cases + is indented by one level. Opening brace is on the same line as switch. + +``` + switch (expression) + { + case 0: + command0; + break; + case 1: + command1; + break; + default: + command; + } +``` + +=== Operators === + +- Operators have spaces on both sides. Do not rely on operator precedence; + use brackets when mixing operators with different priority. +- No space after opening bracket and before closing bracket. + +``` + x = a + b - (c * d); +``` + +== Naming Conventions == + +- Public C API calls and type names must begin with an API-specific prefix, eg. "crm_", "pe_", "st_", "lrm_". + +== vim Settings == + +```vim + " This section contains settings that can be placed in the vimrc file that are + " compatible with the Pacemaker coding guidelines. + + " Whitespace + set ts=4 + set sw=4 + set expandtab + let c_space_error=1 +``` diff --git a/doc/publican-clusterlabs/xsl/html.xsl b/doc/publican-clusterlabs/xsl/html.xsl index 847a595da3..62f8728207 100644 --- a/doc/publican-clusterlabs/xsl/html.xsl +++ b/doc/publican-clusterlabs/xsl/html.xsl @@ -1,30 +1,34 @@ ]> - + + diff --git a/fencing/commands.c b/fencing/commands.c index 26434117d4..2c5b63c4b5 100644 --- a/fencing/commands.c +++ b/fencing/commands.c @@ -1,1697 +1,1885 @@ /* * Copyright (C) 2009 Andrew Beekhof * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public * License as published by the Free Software Foundation; either * version 2 of the License, or (at your option) any later version.
* * This software is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * General Public License for more details. * * You should have received a copy of the GNU General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include GHashTable *device_list = NULL; GHashTable *topology = NULL; GList *cmd_list = NULL; static int active_children = 0; + +struct device_search_s +{ + char *host; + char *action; + int per_device_timeout; + int replies_needed; + int replies_received; + + void *user_data; + void (*callback) (GList *devices, void *user_data); + GListPtr capable; +}; + static gboolean stonith_device_dispatch(gpointer user_data); static void st_child_done(GPid pid, int rc, const char *output, gpointer user_data); +static void +stonith_send_reply(xmlNode *reply, int call_options, const char *remote_peer, const char *client_id); + +static void search_devices_record_result(struct device_search_s *search, const char *device, gboolean can_fence); typedef struct async_command_s { int id; int pid; int fd_stdout; int options; int default_timeout; int timeout; char *op; char *origin; char *client; char *client_name; char *remote_op_id; char *victim; char *action; char *device; char *mode; GListPtr device_list; GListPtr device_next; - void (*done)(GPid pid, int rc, const char *output, gpointer user_data); + void *internal_user_data; + void (*done_cb)(GPid pid, int rc, const char *output, gpointer user_data); guint timer_sigterm; guint timer_sigkill; /*! 
If the operation timed out, this is the last signal * we sent to the process to get it to terminate */ int last_timeout_signo; } async_command_t; static xmlNode * stonith_construct_async_reply(async_command_t *cmd, const char *output, xmlNode *data, int rc); static int get_action_timeout(stonith_device_t *device, const char *action, int default_timeout) { char buffer[512] = { 0, }; char *value = NULL; CRM_CHECK(action != NULL, return default_timeout); if (!device->params) { return default_timeout; } snprintf(buffer, sizeof(buffer) - 1, "pcmk_%s_timeout", action); value = g_hash_table_lookup(device->params, buffer); if (!value) { return default_timeout; } return atoi(value); } static void free_async_command(async_command_t *cmd) { if (!cmd) { return; } cmd_list = g_list_remove(cmd_list, cmd); - g_list_free(cmd->device_list); + g_list_free_full(cmd->device_list, free); free(cmd->device); free(cmd->action); free(cmd->victim); free(cmd->remote_op_id); free(cmd->client); free(cmd->client_name); free(cmd->origin); free(cmd->op); free(cmd); } static async_command_t *create_async_command(xmlNode *msg) { async_command_t *cmd = NULL; xmlNode *op = get_xpath_object("//@"F_STONITH_ACTION, msg, LOG_ERR); const char *action = crm_element_value(op, F_STONITH_ACTION); CRM_CHECK(action != NULL, crm_log_xml_warn(msg, "NoAction"); return NULL); crm_log_xml_trace(msg, "Command"); cmd = calloc(1, sizeof(async_command_t)); crm_element_value_int(msg, F_STONITH_CALLID, &(cmd->id)); crm_element_value_int(msg, F_STONITH_CALLOPTS, &(cmd->options)); crm_element_value_int(msg, F_STONITH_TIMEOUT, &(cmd->default_timeout)); cmd->timeout = cmd->default_timeout; cmd->origin = crm_element_value_copy(msg, F_ORIG); cmd->remote_op_id = crm_element_value_copy(msg, F_STONITH_REMOTE_OP_ID); cmd->client = crm_element_value_copy(msg, F_STONITH_CLIENTID); cmd->client_name = crm_element_value_copy(msg, F_STONITH_CLIENTNAME); cmd->op = crm_element_value_copy(msg, F_STONITH_OPERATION); cmd->action = strdup(action); cmd->victim = crm_element_value_copy(op, F_STONITH_TARGET); cmd->mode = crm_element_value_copy(op, F_STONITH_MODE); cmd->device = crm_element_value_copy(op, F_STONITH_DEVICE); CRM_CHECK(cmd->op != NULL, crm_log_xml_warn(msg, "NoOp"); free_async_command(cmd); return NULL); CRM_CHECK(cmd->client != NULL, crm_log_xml_warn(msg, "NoClient")); + cmd->done_cb = st_child_done; cmd_list = g_list_append(cmd_list, cmd); return cmd; } static int stonith_manual_ack(xmlNode *msg, remote_fencing_op_t *op) { async_command_t *cmd = create_async_command(msg); xmlNode *dev = get_xpath_object("//@"F_STONITH_TARGET, msg, LOG_ERR); if(cmd == NULL) { return -EINVAL; } cmd->device = strdup("manual_ack"); cmd->remote_op_id = strdup(op->id); crm_notice("Injecting manual confirmation that %s is safely off/down", crm_element_value(dev, F_STONITH_TARGET)); - st_child_done(0, 0, NULL, cmd); + cmd->done_cb(0, 0, NULL, cmd); return pcmk_ok; } static gboolean stonith_device_execute(stonith_device_t *device) { int rc = 0; int exec_rc = 0; async_command_t *cmd = NULL; stonith_action_t *action = NULL; CRM_CHECK(device != NULL, return FALSE); if(device->active_pid) { crm_trace("%s is still active with pid %u", device->id, device->active_pid); return TRUE; } if(device->pending_ops) { GList *first = device->pending_ops; device->pending_ops = g_list_remove_link(device->pending_ops, first); cmd = first->data; g_list_free_1(first); } if(cmd == NULL) { crm_trace("Nothing further to do for %s", device->id); return TRUE; } action = stonith_action_create(device->agent, 
static gboolean stonith_device_execute(stonith_device_t *device)
{
    int rc = 0;
    int exec_rc = 0;
    async_command_t *cmd = NULL;
    stonith_action_t *action = NULL;

    CRM_CHECK(device != NULL, return FALSE);

    if(device->active_pid) {
        crm_trace("%s is still active with pid %u", device->id, device->active_pid);
        return TRUE;
    }

    if(device->pending_ops) {
        GList *first = device->pending_ops;
        device->pending_ops = g_list_remove_link(device->pending_ops, first);
        cmd = first->data;
        g_list_free_1(first);
    }

    if(cmd == NULL) {
        crm_trace("Nothing further to do for %s", device->id);
        return TRUE;
    }

    action = stonith_action_create(device->agent,
                                   cmd->action, cmd->victim, cmd->timeout,
                                   device->params, device->aliases);

-   exec_rc = stonith_action_execute_async(action, (void *) cmd, st_child_done);
+   /* for async exec, exec_rc is pid if positive and error code if negative/zero */
+   exec_rc = stonith_action_execute_async(action, (void *) cmd, cmd->done_cb);

    if(exec_rc > 0) {
        crm_debug("Operation %s%s%s on %s now running with pid=%d, timeout=%ds",
                  cmd->action, cmd->victim?" for node ":"", cmd->victim?cmd->victim:"",
                  device->id, exec_rc, cmd->timeout);
        device->active_pid = exec_rc;

    } else {
        crm_warn("Operation %s%s%s on %s failed (%d/%d)",
                 cmd->action, cmd->victim?" for node ":"", cmd->victim?cmd->victim:"",
                 device->id, exec_rc, rc);
-       st_child_done(0, rc<0?rc:exec_rc, NULL, cmd);
+       cmd->done_cb(0, rc<0?rc:exec_rc, NULL, cmd);
    }
    return TRUE;
}

static gboolean stonith_device_dispatch(gpointer user_data)
{
    return stonith_device_execute(user_data);
}

static void schedule_stonith_command(async_command_t *cmd, stonith_device_t *device)
{
    CRM_CHECK(cmd != NULL, return);
    CRM_CHECK(device != NULL, return);

    if (cmd->device) {
        free(cmd->device);
    }

    cmd->device = strdup(device->id);
    cmd->timeout = get_action_timeout(device, cmd->action, cmd->default_timeout);

    if (cmd->remote_op_id) {
-       crm_debug("Scheduling %s on %s for remote peer %s with op id (%s) (timeout=%dms)",
+       crm_debug("Scheduling %s on %s for remote peer %s with op id (%s) (timeout=%ds)",
                  cmd->action, device->id, cmd->origin, cmd->remote_op_id, cmd->timeout);
    } else {
-       crm_debug("Scheduling %s on %s for %s (timeout=%dms)",
+       crm_debug("Scheduling %s on %s for %s (timeout=%ds)",
                  cmd->action, device->id, cmd->client, cmd->timeout);
    }

    device->pending_ops = g_list_append(device->pending_ops, cmd);
    mainloop_set_trigger(device->work);
}

void free_device(gpointer data)
{
    GListPtr gIter = NULL;
    stonith_device_t *device = data;

    g_hash_table_destroy(device->params);
    g_hash_table_destroy(device->aliases);

    for(gIter = device->pending_ops; gIter != NULL; gIter = gIter->next) {
        async_command_t *cmd = gIter->data;

        crm_warn("Removal of device '%s' purged operation %s", device->id, cmd->action);
-       st_child_done(0, -ENODEV, NULL, cmd);
+       cmd->done_cb(0, -ENODEV, NULL, cmd);
        free_async_command(cmd);
    }
    g_list_free(device->pending_ops);

    g_list_free_full(device->targets, free);
    free(device->namespace);
    free(device->on_target_actions);
    free(device->agent);
    free(device->id);
    free(device);
}

static GHashTable *build_port_aliases(const char *hostmap, GListPtr *targets)
{
    char *name = NULL;
    int last = 0, lpc = 0, max = 0, added = 0;
    GHashTable *aliases = g_hash_table_new_full(crm_str_hash, g_str_equal, g_hash_destroy_str, g_hash_destroy_str);

    if(hostmap == NULL) {
        return aliases;
    }

    max = strlen(hostmap);
    for(; lpc <= max; lpc++) {
        switch(hostmap[lpc]) {
            /* Assignment chars */
            case '=':
            case ':':
                if(lpc > last) {
                    free(name);
                    name = calloc(1, 1 + lpc - last);
                    memcpy(name, hostmap + last, lpc - last);
                }
                last = lpc + 1;
                break;

            /* Delimiter chars */
            /* case ',': Potentially used to specify multiple ports */
            case 0:
            case ';':
            case ' ':
            case '\t':
                if(name) {
                    char *value = NULL;

                    value = calloc(1, 1 + lpc - last);
                    memcpy(value, hostmap + last, lpc - last);

                    crm_debug("Adding alias '%s'='%s'", name, value);
                    g_hash_table_replace(aliases, name, value);
                    if(targets) {
                        *targets = g_list_append(*targets, strdup(value));
                    }
                    value=NULL;
                    name=NULL;
                    added++;

                } else if(lpc > last) {
                    crm_debug("Parse error at offset %d near '%s'", lpc-last, hostmap+last);
                }

                last = lpc + 1;
                break;
        }

        if(hostmap[lpc] == 0) {
            break;
        }
    }
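    /* Worked example (hypothetical values): a pcmk_host_map of
     * "pcmk-1:1;pcmk-2:2" produces aliases { "pcmk-1" => "1", "pcmk-2" => "2" }
     * and, when a targets list is supplied, also appends "1" and "2" to it. */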
    if(added == 0) {
        crm_info("No host mappings detected in '%s'", hostmap);
    }

    free(name);
    return aliases;
}

static void parse_host_line(const char *line, GListPtr *output)
{
    int lpc = 0;
    int max = 0;
    int last = 0;

    if(line) {
        max = strlen(line);
    } else {
        return;
    }

    /* Check for any complaints about additional parameters that the device doesn't understand */
    if(strstr(line, "invalid") || strstr(line, "variable")) {
        crm_debug("Skipping: %s", line);
        return;
    }

    crm_trace("Processing: %s", line);

    /* Skip initial whitespace */
    for(lpc = 0; lpc <= max && isspace(line[lpc]); lpc++) {
        last = lpc+1;
    }

    /* Now the actual content */
    for(lpc = 0; lpc <= max; lpc++) {
        gboolean a_space = isspace(line[lpc]);
        if(a_space && lpc < max && isspace(line[lpc+1])) {
            /* fast-forward to the end of the spaces */

        } else if(a_space || line[lpc] == ',' || line[lpc] == 0) {
            int rc = 1;
            char *entry = NULL;

            if(lpc != last) {
                entry = calloc(1, 1 + lpc - last);
                /* the '-' must be last in the scanset to be a literal hyphen */
                rc = sscanf(line+last, "%[a-zA-Z0-9_.-]", entry);
            }

            if(entry == NULL) {
                /* Skip */
            } else if(rc != 1) {
                crm_warn("Could not parse (%d %d): %s", last, lpc, line+last);
            } else if(safe_str_neq(entry, "on") && safe_str_neq(entry, "off")) {
                crm_trace("Adding '%s'", entry);
                *output = g_list_append(*output, entry);
                entry = NULL;
            }

            free(entry);
            last = lpc + 1;
        }
    }
}

static GListPtr parse_host_list(const char *hosts)
{
    int lpc = 0;
    int max = 0;
    int last = 0;
    GListPtr output = NULL;

    if(hosts == NULL) {
        return output;
    }

    max = strlen(hosts);
    for(lpc = 0; lpc <= max; lpc++) {
        if(hosts[lpc] == '\n' || hosts[lpc] == 0) {
            char *line = NULL;

            line = calloc(1, 2 + lpc - last);
            snprintf(line, 1 + lpc - last, "%s", hosts+last);
            parse_host_line(line, &output);
            free(line);

            last = lpc + 1;
        }
    }

    return output;
}

static stonith_device_t *build_device_from_xml(xmlNode *msg)
{
    xmlNode *dev = get_xpath_object("//"F_STONITH_DEVICE, msg, LOG_ERR);
    stonith_device_t *device = NULL;

    device = calloc(1, sizeof(stonith_device_t));
    device->id = crm_element_value_copy(dev, XML_ATTR_ID);
    device->agent = crm_element_value_copy(dev, "agent");
    device->namespace = crm_element_value_copy(dev, "namespace");
    device->params = xml2list(dev);
    device->work = mainloop_add_trigger(G_PRIORITY_HIGH, stonith_device_dispatch, device);
    /* TODO: Hook up priority */
    return device;
}

static const char *
target_list_type(stonith_device_t *dev)
{
    const char *check_type = NULL;

    check_type = g_hash_table_lookup(dev->params, STONITH_ATTR_HOSTCHECK);

    if(check_type == NULL) {

        if(g_hash_table_lookup(dev->params, STONITH_ATTR_HOSTLIST)) {
            check_type = "static-list";
        } else if(g_hash_table_lookup(dev->params, STONITH_ATTR_HOSTMAP)) {
            check_type = "static-list";
        } else {
            check_type = "dynamic-list";
        }
    }

-   return check_type;
+   return check_type;
+}
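+
+/* The helper below funnels the daemon's own device queries ("list",
+ * "status") through the same per-device pending_ops queue used for client
+ * requests, so a device never runs two agent invocations at once. A minimal
+ * usage sketch, assuming a hypothetical callback my_list_done with the same
+ * signature as st_child_done and a hypothetical my_data pointer:
+ *
+ *     static void
+ *     my_list_done(GPid pid, int rc, const char *output, gpointer user_data)
+ *     {
+ *         async_command_t *cmd = user_data;
+ *         // rc is the agent result; output is its captured stdout;
+ *         // my_data comes back as cmd->internal_user_data
+ *     }
+ *
+ *     schedule_internal_command(dev, "list", NULL, 10, my_data, my_list_done);
+ */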
+static void
+schedule_internal_command(stonith_device_t *device,
+                          const char *action,
+                          const char *victim,
+                          int timeout,
+                          void *internal_user_data,
+                          void (*done_cb)(GPid pid, int rc, const char *output, gpointer user_data))
+{
+    async_command_t *cmd = NULL;
+
+    cmd = calloc(1, sizeof(async_command_t));
+
+    cmd->id = -1;
+    cmd->default_timeout = timeout ? timeout : 60;
+    cmd->timeout = cmd->default_timeout;
+    cmd->action = strdup(action);
+    cmd->victim = victim ? strdup(victim) : NULL;
+    cmd->device = strdup(device->id);
+    cmd->origin = strdup("st_internal_cmd");
+    cmd->client = strdup("st_internal_client");
+    cmd->client_name = strdup("st_internal_client_name");
+
+    cmd->internal_user_data = internal_user_data;
+    cmd->done_cb = done_cb;
+
+    schedule_stonith_command(cmd, device);
+}
+
+static gboolean string_in_list(GListPtr list, const char *item)
+{
+    int lpc = 0;
+    int max = g_list_length(list);
+    for(lpc = 0; lpc < max; lpc ++) {
+        const char *value = g_list_nth_data(list, lpc);
+        if(safe_str_eq(item, value)) {
+            return TRUE;
+        }
+    }
+    return FALSE;
+}
+
+static void
+status_search_cb(GPid pid, int rc, const char *output, gpointer user_data)
+{
+    async_command_t *cmd = user_data;
+    struct device_search_s *search = cmd->internal_user_data;
+    stonith_device_t *dev = cmd->device ?
+        g_hash_table_lookup(device_list, cmd->device) : NULL;
+    gboolean can = FALSE;
+
+    if (!dev) {
+        search_devices_record_result(search, NULL, FALSE);
+        return;
+    }
+
+    dev->active_pid = 0;
+    mainloop_set_trigger(dev->work);
+
+    if (rc < 0) {
+        crm_err("Could not invoke %s: rc=%d", dev->id, rc);
+
+    } else if(rc == 1 /* unknown */) {
+        crm_trace("Host %s is not known by %s", search->host, dev->id);
+
+    } else if(rc == 0 /* active */ || rc == 2 /* inactive */) {
+        can = TRUE;
+
+    } else {
+        crm_notice("Unknown result when testing if %s can fence %s: rc=%d", dev->id, search->host, rc);
+    }
+    search_devices_record_result(search, dev->id, can);
+}

static void
-update_dynamic_list(stonith_device_t *dev)
+dynamic_list_search_cb(GPid pid, int rc, const char *output, gpointer user_data)
{
-   time_t now = time(NULL);
+   async_command_t *cmd = user_data;
+   struct device_search_s *search = cmd->internal_user_data;
+   stonith_device_t *dev = cmd->device ?
+       g_hash_table_lookup(device_list, cmd->device) : NULL;
+   gboolean can_fence = FALSE;

    /* Host/alias must be in the list output to be eligible to be fenced
     *
     * Will cause problems if down'd nodes aren't listed or (for virtual nodes)
     * if the guest is still listed despite being moved to another machine
     */
+   if (!dev) {
+       search_devices_record_result(search, NULL, FALSE);
+       return;
+   }

-   if(dev->targets_age < 0) {
-       crm_trace("Port list queries disabled for %s", dev->id);
-
-   } else if(dev->targets == NULL || dev->targets_age + 60 < now) {
-       stonith_action_t *action = NULL;
-       char *output = NULL;
-       int rc = pcmk_ok;
-       int exec_rc = pcmk_ok;
+   dev->active_pid = 0;
+   mainloop_set_trigger(dev->work);

-       if(dev->active_pid != 0) {
-           crm_notice("Port list query can not execute because device is busy, using cache: %s",
-                      dev->targets ? "YES" : "NO");
-           return;
-       }
+   /* If we successfully got the targets earlier, don't disable. */
+   if (rc != 0 && !dev->targets) {
+       crm_notice("Disabling port list queries for %s (%d): %s",
+                  dev->id, rc, output);
+       /* Fall back to status */
+       g_hash_table_replace(dev->params, strdup(STONITH_ATTR_HOSTCHECK), strdup("status"));

-       action = stonith_action_create(dev->agent, "list", NULL, 10, dev->params, NULL);
-       exec_rc = stonith_action_execute(action, &rc, &output);
-
-       if(rc != 0 && dev->active_pid == 0) {
-           /* This device probably only supports a single
-            * connection, which appears to already be in use,
-            * likely involved in a montior or (less likely)
-            * metadata operation.
-            *
-            * Avoid disabling port list queries in the hope that
-            * the op would succeed next time
-            */
-           crm_info("Couldn't query ports for %s.
Call failed with rc=%d and active_pid=%d: %s", - dev->agent, rc, dev->active_pid, output); - - } else if(exec_rc < 0 || rc != 0) { - /* If we successfully got the targets earlier, don't disable. */ - if (dev->targets) { - return; - } - crm_notice("Disabling port list queries for %s (%d/%d): %s", - dev->id, exec_rc, rc, output); - dev->targets_age = -1; + g_list_free_full(dev->targets, free); + dev->targets = NULL; + } else if (!rc) { + crm_info("Refreshing port list for %s", dev->id); + g_list_free_full(dev->targets, free); + dev->targets = parse_host_list(output); + dev->targets_age = time(NULL); + } - /* Fall back to status */ - g_hash_table_replace(dev->params, strdup(STONITH_ATTR_HOSTCHECK), strdup("status")); + if (dev->targets) { + const char *alias = g_hash_table_lookup(dev->aliases, search->host); - g_list_free_full(dev->targets, free); - dev->targets = NULL; - } else { - crm_info("Refreshing port list for %s", dev->id); - g_list_free_full(dev->targets, free); - dev->targets = parse_host_list(output); - dev->targets_age = now; + if (!alias) { + alias = search->host; + } + if (string_in_list(dev->targets, alias)) { + can_fence = TRUE; } - - free(output); } + search_devices_record_result(search, dev->id, can_fence); } /*! * \internal * \brief Checks to see if an identical device already exists in the device_list */ static stonith_device_t * device_has_duplicate(stonith_device_t *device) { char *key = NULL; char *value = NULL; GHashTableIter gIter; stonith_device_t *dup = g_hash_table_lookup(device_list, device->id); if (!dup || safe_str_neq(dup->agent, device->agent)) { return NULL; } g_hash_table_iter_init(&gIter, device->params); while (g_hash_table_iter_next(&gIter, (void **) &key, (void **) &value)) { char *other_value = g_hash_table_lookup(dup->params, key); if (!other_value || safe_str_neq(other_value, value)) { return NULL; } } return dup; } static char * get_on_target_actions(const char *agent) { stonith_t *st = stonith_api_new(); char *actions = NULL; char *buffer = NULL; xmlNode *xml = NULL; xmlXPathObjectPtr xpath = NULL; int rc = 0, max = 0, lpc = 0; rc = st->cmds->metadata(st, st_opt_sync_call, agent, NULL, &buffer, 10); if (rc || !buffer) { goto on_target_actions_cleanup; } xml = string2xml(buffer); xpath = xpath_search(xml, "//action"); if (!xpath || !xpath->nodesetval) { goto on_target_actions_cleanup; } max = xpath->nodesetval->nodeNr; actions = calloc(1, 512); for (lpc = 0; lpc < max; lpc++) { const char *on_target = NULL; const char *action = NULL; xmlNode *match = getXpathResult(xpath, lpc); CRM_CHECK(match != NULL, continue); on_target = crm_element_value(match, "on_target"); action = crm_element_value(match, "name"); if (action && safe_str_eq(on_target, "true")) { if (strlen(actions)) { g_strlcat(actions, " ", 512); } g_strlcat(actions, action, 512); } } if (!strlen(actions)) { free(actions); actions = NULL; } on_target_actions_cleanup: free(buffer); stonith_api_delete(st); free_xml(xml); return actions; } int stonith_device_register(xmlNode *msg, const char **desc, gboolean from_cib) { const char *value = NULL; stonith_device_t *dup = NULL; stonith_device_t *device = build_device_from_xml(msg); value = g_hash_table_lookup(device->params, STONITH_ATTR_HOSTLIST); if(value) { device->targets = parse_host_list(value); } value = g_hash_table_lookup(device->params, STONITH_ATTR_HOSTMAP); device->aliases = build_port_aliases(value, &(device->targets)); - value = target_list_type(device); - if (safe_str_eq(value, "dynamic-list")) { - /* set the dynamic list during the 
register to guarantee we have - * targets cached */ - update_dynamic_list(device); - } - if ((dup = device_has_duplicate(device))) { crm_notice("Device '%s' already existed in device list (%d active devices)", device->id, g_hash_table_size(device_list)); free_device(device); device = dup; } else { stonith_device_t *old = g_hash_table_lookup(device_list, device->id); if (from_cib && old && old->api_registered) { /* If the cib is writing over an entry that is shared with a stonith client, * copy any pending ops that currently exist on the old entry to the new one. * Otherwise the pending ops will be reported as failures */ device->pending_ops = old->pending_ops; device->api_registered = TRUE; old->pending_ops = NULL; if (device->pending_ops) { mainloop_set_trigger(device->work); } } g_hash_table_replace(device_list, device->id, device); crm_notice("Added '%s' to the device list (%d active devices)", device->id, g_hash_table_size(device_list)); if ((device->on_target_actions = get_on_target_actions(device->agent))) { crm_info("The fencing device '%s' requires actions (%s) to be executed on the target node", device->id, device->on_target_actions); } } if(desc) { *desc = device->id; } if (from_cib) { device->cib_registered = TRUE; } else { device->api_registered = TRUE; } return pcmk_ok; } int stonith_device_remove(const char *id, gboolean from_cib) { stonith_device_t *device = g_hash_table_lookup(device_list, id); if (!device) { crm_info("Device '%s' not found (%d active devices)", id, g_hash_table_size(device_list)); return pcmk_ok; } if (from_cib) { device->cib_registered = FALSE; } else { device->verified = FALSE; device->api_registered = FALSE; } if (!device->cib_registered && !device->api_registered) { g_hash_table_remove(device_list, id); crm_info("Removed '%s' from the device list (%d active devices)", id, g_hash_table_size(device_list)); } return pcmk_ok; } static int count_active_levels(stonith_topology_t *tp) { int lpc = 0; int count = 0; for(lpc = 0; lpc < ST_LEVEL_MAX; lpc++) { if(tp->levels[lpc] != NULL) { count++; } } return count; } void free_topology_entry(gpointer data) { stonith_topology_t *tp = data; int lpc = 0; for(lpc = 0; lpc < ST_LEVEL_MAX; lpc++) { if(tp->levels[lpc] != NULL) { g_list_free_full(tp->levels[lpc], free); } } free(tp->node); free(tp); } int stonith_level_register(xmlNode *msg, char **desc) { int id = 0; int rc = pcmk_ok; xmlNode *child = NULL; xmlNode *level = get_xpath_object("//"F_STONITH_LEVEL, msg, LOG_ERR); const char *node = crm_element_value(level, F_STONITH_TARGET); stonith_topology_t *tp = g_hash_table_lookup(topology, node); crm_element_value_int(level, XML_ATTR_ID, &id); if(desc) { *desc = g_strdup_printf("%s[%d]", node, id); } if(id <= 0 || id >= ST_LEVEL_MAX) { return -EINVAL; } if(tp == NULL) { tp = calloc(1, sizeof(stonith_topology_t)); tp->node = strdup(node); g_hash_table_replace(topology, tp->node, tp); crm_trace("Added %s to the topology (%d active entries)", node, g_hash_table_size(topology)); } if(tp->levels[id] != NULL) { crm_info("Adding to the existing %s[%d] topology entry (%d active entries)", node, id, count_active_levels(tp)); } for (child = __xml_first_child(level); child != NULL; child = __xml_next(child)) { const char *device = ID(child); crm_trace("Adding device '%s' for %s (%d)", device, node, id); tp->levels[id] = g_list_append(tp->levels[id], strdup(device)); } crm_info("Node %s has %d active fencing levels", node, count_active_levels(tp)); return rc; } int stonith_level_remove(xmlNode *msg, char **desc) { int id = 0; 
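    /* Level index 0 is special here: stonith_level_register() above rejects
     * id <= 0, while for removal id == 0 means "drop every level registered
     * for this node". Usable levels therefore live in 1..ST_LEVEL_MAX-1. */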
    xmlNode *level = get_xpath_object("//"F_STONITH_LEVEL, msg, LOG_ERR);
    const char *node = crm_element_value(level, F_STONITH_TARGET);
    stonith_topology_t *tp = g_hash_table_lookup(topology, node);

    crm_element_value_int(level, XML_ATTR_ID, &id);
    if(desc) {
        *desc = g_strdup_printf("%s[%d]", node, id);
    }

    if(tp == NULL) {
        crm_info("Node %s not found (%d active entries)",
                 node, g_hash_table_size(topology));
        return pcmk_ok;

    } else if(id < 0 || id >= ST_LEVEL_MAX) {
        return -EINVAL;
    }

    if(id == 0 && g_hash_table_remove(topology, node)) {
        crm_info("Removed all %s related entries from the topology (%d active entries)",
                 node, g_hash_table_size(topology));

    } else if(id > 0 && tp->levels[id] != NULL) {
        g_list_free_full(tp->levels[id], free);
        tp->levels[id] = NULL;

        crm_info("Removed entry '%d' from %s's topology (%d active entries remaining)",
                 id, node, count_active_levels(tp));
    }

    return pcmk_ok;
}

-static gboolean string_in_list(GListPtr list, const char *item)
-{
-    int lpc = 0;
-    int max = g_list_length(list);
-    for(lpc = 0; lpc < max; lpc ++) {
-        const char *value = g_list_nth_data(list, lpc);
-        if(safe_str_eq(item, value)) {
-            return TRUE;
-        }
-    }
-    return FALSE;
-}
-
static int stonith_device_action(xmlNode *msg, char **output)
{
    int rc = pcmk_ok;
    xmlNode *dev = get_xpath_object("//"F_STONITH_DEVICE, msg, LOG_ERR);
    const char *id = crm_element_value(dev, F_STONITH_DEVICE);

    async_command_t *cmd = NULL;
    stonith_device_t *device = NULL;

    if(id) {
        crm_trace("Looking for '%s'", id);
        device = g_hash_table_lookup(device_list, id);
    }

    if(device) {
        cmd = create_async_command(msg);
        if(cmd == NULL) {
            /* the device is still registered in device_list, so it must not
             * be freed here */
            return -EPROTO;
        }

        schedule_stonith_command(cmd, device);
        rc = -EINPROGRESS;

    } else {
        crm_info("Device %s not found", id?id:"");
        rc = -ENODEV;
    }

    return rc;
}

-static gboolean can_fence_host_with_device(stonith_device_t *dev, const char *host, const char *action)
+static void
+search_devices_record_result(struct device_search_s *search, const char *device, gboolean can_fence)
+{
+    search->replies_received++;
+
+    if (can_fence) {
+        search->capable = g_list_append(search->capable, strdup(device));
+    }
+
+    if (search->replies_needed == search->replies_received) {
+
+        crm_debug("Finished Search. %d devices can perform action (%s) on node %s",
+                  g_list_length(search->capable),
+                  search->action ? search->action : "",
+                  search->host ?
search->host : ""); + + search->callback(search->capable, search->user_data); + free(search->host); + free(search->action); + free(search); + } +} + +static void +can_fence_host_with_device(stonith_device_t *dev, struct device_search_s *search) { gboolean can = FALSE; - const char *alias = host; const char *check_type = NULL; + const char *host = search->host; + const char *alias = host; if(dev == NULL) { - return FALSE; - + goto search_report_results; } else if(host == NULL) { - return TRUE; + can = TRUE; + goto search_report_results; } if (dev->on_target_actions && - action && - strstr(dev->on_target_actions, action) && + search->action && + strstr(dev->on_target_actions, search->action) && safe_str_neq(host, stonith_our_uname)) { /* this device can only execute this action on the target node */ - return FALSE; + goto search_report_results; } if(g_hash_table_lookup(dev->aliases, host)) { alias = g_hash_table_lookup(dev->aliases, host); } check_type = target_list_type(dev); if(safe_str_eq(check_type, "none")) { can = TRUE; } else if(safe_str_eq(check_type, "static-list")) { /* Presence in the hostmap is sufficient * Only use if all hosts on which the device can be active can always fence all listed hosts */ if(string_in_list(dev->targets, host)) { can = TRUE; } else if(g_hash_table_lookup(dev->params, STONITH_ATTR_HOSTMAP) && g_hash_table_lookup(dev->aliases, host)) { can = TRUE; } } else if(safe_str_eq(check_type, "dynamic-list")) { - update_dynamic_list(dev); - - if(string_in_list(dev->targets, alias)) { - can = TRUE; + time_t now = time(NULL); + if (dev->targets == NULL || dev->targets_age + 60 < now) { + schedule_internal_command(dev, + "list", + NULL, + search->per_device_timeout, + search, + dynamic_list_search_cb); + + /* we'll respond to this search request async in the cb */ + return; } - } else if(safe_str_eq(check_type, "status")) { - int rc = 0; - int exec_rc = 0; - stonith_action_t *action = NULL; - /* Run the status operation for the device/target combination - * Will cause problems if the device doesn't return 2 for down'd nodes or - * (for virtual nodes) if the device doesn't return 1 for guests that - * have been moved to another host - */ - - action = stonith_action_create(dev->agent, "status", host, 5, dev->params, dev->aliases); - exec_rc = stonith_action_execute(action, &rc, NULL); - - if(exec_rc != 0) { - crm_err("Could not invoke %s: rc=%d", dev->id, exec_rc); - - } else if(rc == 1 /* unkown */) { - crm_trace("Host %s is not known by %s", host, dev->id); - - } else if(rc == 0 /* active */ || rc == 2 /* inactive */) { + if (string_in_list(dev->targets, alias)) { can = TRUE; - - } else { - crm_notice("Unkown result when testing if %s can fence %s: rc=%d", dev->id, host, rc); } + } else if(safe_str_eq(check_type, "status")) { + schedule_internal_command(dev, + "status", + search->host, + search->per_device_timeout, + search, + status_search_cb); + /* we'll respond to this search request async in the cb */ + return; } else { crm_err("Unknown check type: %s", check_type); } if(safe_str_eq(host, alias)) { crm_info("%s can%s fence %s: %s", dev->id, can?"":" not", host, check_type); } else { crm_info("%s can%s fence %s (aka. 
'%s'): %s", dev->id, can?"":" not", host, alias, check_type);
    }

-   return can;
-}
-
-struct device_search_s
-{
-    const char *host;
-    const char *action;
-    GListPtr capable;
-};
+search_report_results:
+    search_devices_record_result(search, dev->id, can);
+}

-static void search_devices(
-    gpointer key, gpointer value, gpointer user_data)
+static void
+search_devices(gpointer key, gpointer value, gpointer user_data)
{
    stonith_device_t *dev = value;
    struct device_search_s *search = user_data;
-   if(can_fence_host_with_device(dev, search->host, search->action)) {
-       search->capable = g_list_append(search->capable, value);
-   }
+
+    can_fence_host_with_device(dev, search);
}

-static int stonith_query(xmlNode *msg, xmlNode **list)
+#define DEFAULT_QUERY_TIMEOUT 20
+static void
+get_capable_devices(const char *host, const char *action, int timeout, void *user_data, void (*callback)(GList *devices, void *user_data))
{
-   struct device_search_s search;
-   int available_devices = 0;
-   const char *action = NULL;
-   xmlNode *dev = get_xpath_object("//@"F_STONITH_ACTION, msg, LOG_DEBUG_3);
+   struct device_search_s *search;
+   int per_device_timeout = DEFAULT_QUERY_TIMEOUT;
+   int devices_needing_async_query = 0;
+   char *key = NULL;
+   const char *check_type = NULL;
+   GHashTableIter gIter;
+   stonith_device_t *device = NULL;

-   search.host = NULL;
-   search.capable = NULL;
+   if (!g_hash_table_size(device_list)) {
+       callback(NULL, user_data);
+       return;
+   }

-   if(dev) {
-       const char *device = crm_element_value(dev, F_STONITH_DEVICE);
-       search.host = crm_element_value(dev, F_STONITH_TARGET);
-       search.action = crm_element_value(dev, F_STONITH_ACTION);
-       if(device && safe_str_eq(device, "manual_ack")) {
-           /* No query necessary */
-           if(list) {
-               *list = NULL;
-           }
-           return pcmk_ok;
+   search = calloc(1, sizeof(struct device_search_s));
+   if (!search) {
+       callback(NULL, user_data);
+       return;
+   }
+
+   g_hash_table_iter_init(&gIter, device_list);
+   while (g_hash_table_iter_next(&gIter, (void **) &key, (void **) &device)) {
+       check_type = target_list_type(device);
+       if (safe_str_eq(check_type, "status") || safe_str_eq(check_type, "dynamic-list")) {
+           devices_needing_async_query++;
+       }
+   }

-       action = search.action;
+   /* If we have devices that require an async event in order to know what
+    * nodes they can fence, we have to give the events a timeout. The total
+    * query timeout is divided among those events. */
+   if (devices_needing_async_query) {
+       per_device_timeout = timeout / devices_needing_async_query;
+       if (!per_device_timeout) {
+           crm_err("stonith-timeout duration %d is too low, raise the duration to %d seconds",
+                   timeout, DEFAULT_QUERY_TIMEOUT * devices_needing_async_query);
+           per_device_timeout = DEFAULT_QUERY_TIMEOUT;
+       } else if (per_device_timeout < DEFAULT_QUERY_TIMEOUT) {
+           crm_notice("stonith-timeout duration %d is low for the current configuration. Consider raising it to %d seconds",
+                      timeout, DEFAULT_QUERY_TIMEOUT * devices_needing_async_query);
+       }
    }

-   crm_log_xml_debug(msg, "Query");
+   search->host = host ? strdup(host) : NULL;
+   search->action = action ? strdup(action) : NULL;
+   search->per_device_timeout = per_device_timeout;
+   /* We are guaranteed this many replies. Even if a device gets
+    * unregistered somehow during the async search, we will get
+    * the correct number of replies.
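+    * (Even a device unregistered mid-search still produces a reply:
+    * status_search_cb() and dynamic_list_search_cb() above record a
+    * negative result via search_devices_record_result(search, NULL, FALSE)
+    * when their device lookup fails, so the count still converges.)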
*/ + search->replies_needed = g_hash_table_size(device_list); + search->callback = callback; + search->user_data = user_data; + /* kick off the search */ + + crm_debug("Searching through %d devices to see what is capable of action (%s) for target %s", + search->replies_needed, + search->action ? search->action : "", + search->host ? search->host : ""); + g_hash_table_foreach(device_list, search_devices, search); +} + +struct st_query_data { + xmlNode *reply; + char *remote_peer; + char *client_id; + char *target; + char *action; + int call_options; +}; + +static void +stonith_query_capable_device_cb(GList *devices, void *user_data) +{ + struct st_query_data *query = user_data; + int available_devices = 0; + xmlNode *dev = NULL; + xmlNode *list = NULL; + GListPtr lpc = NULL; + + /* Pack the results into data */ + list = create_xml_node(NULL, __FUNCTION__); + crm_xml_add(list, F_STONITH_TARGET, query->target); + for(lpc = devices; lpc != NULL; lpc = lpc->next) { + stonith_device_t *device = g_hash_table_lookup(device_list, lpc->data); + int action_specific_timeout; + + if (!device) { + /* It is possible the device got unregistered while + * determining who can fence the target */ + continue; + } + + available_devices++; + + action_specific_timeout = get_action_timeout(device, query->action, 0); + dev = create_xml_node(list, F_STONITH_DEVICE); + crm_xml_add(dev, XML_ATTR_ID, device->id); + crm_xml_add(dev, "namespace", device->namespace); + crm_xml_add(dev, "agent", device->agent); + crm_xml_add_int(dev, F_STONITH_DEVICE_VERIFIED, device->verified); + if (action_specific_timeout) { + crm_xml_add_int(dev, F_STONITH_ACTION_TIMEOUT, action_specific_timeout); + } + if (query->target == NULL) { + xmlNode *attrs = create_xml_node(dev, XML_TAG_ATTRS); + g_hash_table_foreach(device->params, hash2field, attrs); + } + } - g_hash_table_foreach(device_list, search_devices, &search); - available_devices = g_list_length(search.capable); - if(search.host) { + crm_xml_add_int(list, "st-available-devices", available_devices); + if (query->target) { crm_debug("Found %d matching devices for '%s'", - available_devices, search.host); + available_devices, query->target); } else { crm_debug("%d devices installed", available_devices); } - /* Pack the results into data */ - if(list) { - GListPtr lpc = NULL; - *list = create_xml_node(NULL, __FUNCTION__); - crm_xml_add(*list, F_STONITH_TARGET, search.host); - crm_xml_add_int(*list, "st-available-devices", available_devices); - for(lpc = search.capable; lpc != NULL; lpc = lpc->next) { - stonith_device_t *device = (stonith_device_t*)lpc->data; - int action_specific_timeout = get_action_timeout(device, action, 0); - - dev = create_xml_node(*list, F_STONITH_DEVICE); - crm_xml_add(dev, XML_ATTR_ID, device->id); - crm_xml_add(dev, "namespace", device->namespace); - crm_xml_add(dev, "agent", device->agent); - crm_xml_add_int(dev, F_STONITH_DEVICE_VERIFIED, device->verified); - if (action_specific_timeout) { - crm_xml_add_int(dev, F_STONITH_ACTION_TIMEOUT, action_specific_timeout); - } - if(search.host == NULL) { - xmlNode *attrs = create_xml_node(dev, XML_TAG_ATTRS); - g_hash_table_foreach(device->params, hash2field, attrs); - } + + if (list != NULL) { + crm_trace("Attaching query list output"); + add_message_xml(query->reply, F_STONITH_CALLDATA, list); + } + stonith_send_reply(query->reply, query->call_options, query->remote_peer, query->client_id); + + free_xml(query->reply); + free(query->remote_peer); + free(query->client_id); + free(query->target); + free(query->action); 
+ free(query); + free_xml(list); + g_list_free_full(devices, free); +} + +static void +stonith_query(xmlNode *msg, const char *remote_peer, const char *client_id, int call_options) +{ + struct st_query_data *query = NULL; + const char *action = NULL; + const char *target = NULL; + int timeout = 0; + xmlNode *dev = get_xpath_object("//@"F_STONITH_ACTION, msg, LOG_DEBUG_3); + + if(dev) { + const char *device = crm_element_value(dev, F_STONITH_DEVICE); + target = crm_element_value(dev, F_STONITH_TARGET); + action = crm_element_value(dev, F_STONITH_ACTION); + crm_element_value_int(dev, F_STONITH_TIMEOUT, &timeout); + if(device && safe_str_eq(device, "manual_ack")) { + /* No query or reply necessary */ + return; } } - g_list_free(search.capable); + crm_log_xml_debug(msg, "Query"); + query = calloc(1, sizeof(struct st_query_data)); - return available_devices; + query->reply = stonith_construct_reply(msg, NULL, NULL, pcmk_ok); + query->remote_peer = remote_peer ? strdup(remote_peer) : NULL; + query->client_id = client_id ? strdup(client_id) : NULL; + query->target = target ? strdup(target) : NULL; + query->action = action ? strdup(action) : NULL; + query->call_options = call_options; + + get_capable_devices(target, action, timeout, query, stonith_query_capable_device_cb); } #define ST_LOG_OUTPUT_MAX 512 static void log_operation(async_command_t *cmd, int rc, int pid, const char *next, const char *output) { if(rc == 0) { next = NULL; } if(cmd->victim != NULL) { do_crm_log(rc==0?LOG_NOTICE:LOG_ERR, "Operation '%s' [%d] (call %d from %s) for host '%s' with device '%s' returned: %d (%s)%s%s", cmd->action, pid, cmd->id, cmd->client_name, cmd->victim, cmd->device, rc, pcmk_strerror(rc), next?". Trying: ":"", next?next:""); } else { do_crm_log_unlikely(rc==0?LOG_DEBUG:LOG_NOTICE, "Operation '%s' [%d] for device '%s' returned: %d (%s)%s%s", cmd->action, pid, cmd->device, rc, pcmk_strerror(rc), next?". Trying: ":"", next?next:""); } if(output) { /* Logging the whole string confuses syslog when the string is xml */ char *local_copy = strdup(output); int lpc = 0, last = 0, more = strlen(local_copy); /* Give the log output some reasonable boundary */ more = more > ST_LOG_OUTPUT_MAX ? 
ST_LOG_OUTPUT_MAX : more; for(lpc = 0; lpc < more; lpc++) { if(local_copy[lpc] == '\n' || local_copy[lpc] == 0) { local_copy[lpc] = 0; do_crm_log(rc==0?LOG_INFO:LOG_WARNING, "%s: %s", cmd->device, local_copy+last); last = lpc+1; } } crm_debug("%s: %s (total %d bytes)", cmd->device, local_copy+last, more); free(local_copy); } } static void stonith_send_async_reply(async_command_t *cmd, const char *output, int rc, GPid pid) { xmlNode *reply = NULL; gboolean bcast = FALSE; reply = stonith_construct_async_reply(cmd, output, NULL, rc); if(safe_str_eq(cmd->action, "metadata")) { /* Too verbose to log */ crm_trace("Metadata query for %s", cmd->device); output = NULL; } else if(crm_str_eq(cmd->action, "monitor", TRUE) || crm_str_eq(cmd->action, "list", TRUE) || crm_str_eq(cmd->action, "status", TRUE)) { crm_trace("Never broadcast %s replies", cmd->action); } else if(!stand_alone && safe_str_eq(cmd->origin, cmd->victim)) { crm_trace("Broadcast %s reply for %s", cmd->action, cmd->victim); crm_xml_add(reply, F_SUBTYPE, "broadcast"); bcast = TRUE; } log_operation(cmd, rc, pid, NULL, output); crm_log_xml_trace(reply, "Reply"); if(bcast) { crm_xml_add(reply, F_STONITH_OPERATION, T_STONITH_NOTIFY); send_cluster_message(NULL, crm_msg_stonith_ng, reply, FALSE); } else if(cmd->origin) { crm_trace("Directed reply to %s", cmd->origin); send_cluster_message(crm_get_peer(0, cmd->origin), crm_msg_stonith_ng, reply, FALSE); } else { crm_trace("Directed local %ssync reply to %s", (cmd->options & st_opt_sync_call)?"":"a-", cmd->client_name); do_local_reply(reply, cmd->client, cmd->options & st_opt_sync_call, FALSE); } if(stand_alone) { /* Do notification with a clean data object */ xmlNode *notify_data = create_xml_node(NULL, T_STONITH_NOTIFY_FENCE); crm_xml_add_int(notify_data, F_STONITH_RC, rc); crm_xml_add(notify_data, F_STONITH_TARGET, cmd->victim); crm_xml_add(notify_data, F_STONITH_OPERATION, cmd->op); crm_xml_add(notify_data, F_STONITH_DELEGATE, cmd->device); crm_xml_add(notify_data, F_STONITH_REMOTE_OP_ID, cmd->remote_op_id); crm_xml_add(notify_data, F_STONITH_ORIGIN, cmd->client); do_stonith_notify(0, T_STONITH_NOTIFY_FENCE, rc, notify_data); } free_xml(reply); } static void cancel_stonith_command(async_command_t *cmd) { stonith_device_t *device; CRM_CHECK(cmd != NULL, return); if (!cmd->device) { return; } device = g_hash_table_lookup(device_list, cmd->device); if (device) { crm_trace("Cancel scheduled %s on %s", cmd->action, device->id); device->pending_ops = g_list_remove(device->pending_ops, cmd); } } #define READ_MAX 500 static void st_child_done(GPid pid, int rc, const char *output, gpointer user_data) { stonith_device_t *device = NULL; async_command_t *cmd = user_data; GListPtr gIter = NULL; GListPtr gIterNext = NULL; CRM_CHECK(cmd != NULL, return); active_children--; /* The device is ready to do something else now */ device = g_hash_table_lookup(device_list, cmd->device); if(device) { device->active_pid = 0; if (rc == pcmk_ok && (safe_str_eq(cmd->action, "list") || safe_str_eq(cmd->action, "monitor") || safe_str_eq(cmd->action, "status"))) { device->verified = TRUE; } mainloop_set_trigger(device->work); } crm_trace("Operation %s on %s completed with rc=%d (%d remaining)", cmd->action, cmd->device, rc, g_list_length(cmd->device_next)); if(rc != 0 && cmd->device_next) { - stonith_device_t *dev = cmd->device_next->data; + stonith_device_t *dev = g_hash_table_lookup(device_list, cmd->device_next->data); - log_operation(cmd, rc, pid, dev->id, output); + if (dev) { + log_operation(cmd, rc, pid, dev->id, 
output);

-           cmd->device_next = cmd->device_next->next;
-           schedule_stonith_command(cmd, dev);
-           /* Prevent cmd from being freed */
-           cmd = NULL;
-           goto done;
+           cmd->device_next = cmd->device_next->next;
+           schedule_stonith_command(cmd, dev);
+           /* Prevent cmd from being freed */
+           cmd = NULL;
+           goto done;
+       }
    }

    if(rc > 0) {
        rc = -pcmk_err_generic;
    }

    stonith_send_async_reply(cmd, output, rc, pid);

    if(rc != 0) {
        goto done;
    }

    /* Check to see if any operations are scheduled to do the exact
     * same thing that just completed. If so, rather than
     * performing the same fencing operation twice, return the result
     * of this operation for all pending commands it matches. */
    for (gIter = cmd_list; gIter != NULL; gIter = gIterNext) {
        async_command_t *cmd_other = gIter->data;
        gIterNext = gIter->next;

        if(cmd == cmd_other) {
            continue;
        }

        /* A pending scheduled command matches the command that just finished if:
         * 1. The client connections are different.
         * 2. The node victim is the same.
         * 3. The fencing action is the same.
         * 4. The device scheduled to execute the action is the same.
         */
        if(safe_str_eq(cmd->client, cmd_other->client) ||
           safe_str_neq(cmd->victim, cmd_other->victim) ||
           safe_str_neq(cmd->action, cmd_other->action) ||
           safe_str_neq(cmd->device, cmd_other->device)) {

            continue;
        }

        crm_notice("Merging stonith action %s for node %s originating from client %s with identical stonith request from client %s",
                   cmd_other->action, cmd_other->victim, cmd_other->client_name, cmd->client_name);

        cmd_list = g_list_remove_link(cmd_list, gIter);
        stonith_send_async_reply(cmd_other, output, rc, pid);
        cancel_stonith_command(cmd_other);

        free_async_command(cmd_other);
        g_list_free_1(gIter);
    }

done:
    free_async_command(cmd);
}

static gint sort_device_priority(gconstpointer a, gconstpointer b)
{
    /* the capable-devices list holds device ids, so map them back to the
     * registered devices before comparing priorities */
    const stonith_device_t *dev_a = g_hash_table_lookup(device_list, a);
    const stonith_device_t *dev_b = g_hash_table_lookup(device_list, b);

    if(dev_a == NULL || dev_b == NULL) {
        return 0;
    } else if(dev_a->priority > dev_b->priority) {
        return -1;
    } else if(dev_a->priority < dev_b->priority) {
        return 1;
    }
    return 0;
}

-static int stonith_fence(xmlNode *msg)
+static void
+stonith_fence_get_devices_cb(GList *devices, void *user_data)
+{
+    async_command_t *cmd = user_data;
+    stonith_device_t *device = NULL;
+
+    crm_info("Found %d matching devices for '%s'", g_list_length(devices), cmd->victim);
+
+    if (g_list_length(devices) > 0) {
+        /* Order based on priority */
+        devices = g_list_sort(devices, sort_device_priority);
+        device = g_hash_table_lookup(device_list, devices->data);
+
+        if (device) {
+            cmd->device_list = devices;
+            cmd->device_next = devices->next;
+            devices = NULL; /* list owned by cmd now */
+        }
+    }
+
+    /* we have a device, schedule it for fencing. */
+    if (device) {
+        schedule_stonith_command(cmd, device);
+        /* in progress */
+        return;
+    }
+
+    /* no device found!
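+     * The search over every registered device came back with no capable
+     * candidates, so answer the caller now with -EHOSTUNREACH rather than
+     * leaving the operation pending.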
*/ + stonith_send_async_reply(cmd, NULL, -EHOSTUNREACH, 0); + + free_async_command(cmd); + g_list_free_full(devices, free); +} + +static int +stonith_fence(xmlNode *msg) { - int options = 0; const char *device_id = NULL; + int rc = -EHOSTUNREACH; stonith_device_t *device = NULL; async_command_t *cmd = create_async_command(msg); xmlNode *dev = get_xpath_object("//@"F_STONITH_TARGET, msg, LOG_ERR); if(cmd == NULL) { return -EPROTO; } device_id = crm_element_value(dev, F_STONITH_DEVICE); if(device_id) { device = g_hash_table_lookup(device_list, device_id); if(device == NULL) { crm_err("Requested device '%s' is not available", device_id); + } else { + schedule_stonith_command(cmd, device); + rc = -EINPROGRESS; } - } else { - struct device_search_s search; - - search.capable = NULL; - search.host = crm_element_value(dev, F_STONITH_TARGET); - search.action = crm_element_value(dev, F_STONITH_ACTION); + const char *host = crm_element_value(dev, F_STONITH_TARGET); - crm_element_value_int(msg, F_STONITH_CALLOPTS, &options); - if(options & st_opt_cs_nodeid) { - int nodeid = crm_atoi(search.host, NULL); + if(cmd->options & st_opt_cs_nodeid) { + int nodeid = crm_atoi(host, NULL); crm_node_t *node = crm_get_peer(nodeid, NULL); if(node) { - search.host = node->uname; - } - } - - g_hash_table_foreach(device_list, search_devices, &search); - crm_info("Found %d matching devices for '%s'", g_list_length(search.capable), search.host); - - if(g_list_length(search.capable) > 0) { - /* Order based on priority */ - search.capable = g_list_sort(search.capable, sort_device_priority); - - device = search.capable->data; - - if(g_list_length(search.capable) > 1) { - cmd->device_list = search.capable; - cmd->device_next = cmd->device_list->next; - } else { - g_list_free(search.capable); + host = node->uname; } } + get_capable_devices(host, cmd->action, cmd->default_timeout, cmd, stonith_fence_get_devices_cb); + rc = -EINPROGRESS; } - if(device) { - schedule_stonith_command(cmd, device); - return -EINPROGRESS; - } - - free_async_command(cmd); - return -EHOSTUNREACH; + return rc; } -xmlNode *stonith_construct_reply(xmlNode *request, char *output, xmlNode *data, int rc) +xmlNode *stonith_construct_reply(xmlNode *request, const char *output, xmlNode *data, int rc) { int lpc = 0; xmlNode *reply = NULL; const char *name = NULL; const char *value = NULL; const char *names[] = { F_STONITH_OPERATION, F_STONITH_CALLID, F_STONITH_CLIENTID, F_STONITH_CLIENTNAME, F_STONITH_REMOTE_OP_ID, F_STONITH_CALLOPTS }; crm_trace("Creating a basic reply"); reply = create_xml_node(NULL, T_STONITH_REPLY); crm_xml_add(reply, "st_origin", __FUNCTION__); crm_xml_add(reply, F_TYPE, T_STONITH_NG); crm_xml_add(reply, "st_output", output); crm_xml_add_int(reply, F_STONITH_RC, rc); CRM_CHECK(request != NULL, crm_warn("Can't create a sane reply"); return reply); for(lpc = 0; lpc < DIMOF(names); lpc++) { name = names[lpc]; value = crm_element_value(request, name); crm_xml_add(reply, name, value); } if(data != NULL) { crm_trace("Attaching reply output"); add_message_xml(reply, F_STONITH_CALLDATA, data); } return reply; } static xmlNode * stonith_construct_async_reply(async_command_t *cmd, const char *output, xmlNode *data, int rc) { xmlNode *reply = NULL; crm_trace("Creating a basic reply"); reply = create_xml_node(NULL, T_STONITH_REPLY); crm_xml_add(reply, "st_origin", __FUNCTION__); crm_xml_add(reply, F_TYPE, T_STONITH_NG); crm_xml_add(reply, F_STONITH_OPERATION, cmd->op); crm_xml_add(reply, F_STONITH_DEVICE, cmd->device); crm_xml_add(reply, 
F_STONITH_REMOTE_OP_ID, cmd->remote_op_id); crm_xml_add(reply, F_STONITH_CLIENTID, cmd->client); crm_xml_add(reply, F_STONITH_CLIENTNAME, cmd->client_name); crm_xml_add(reply, F_STONITH_TARGET, cmd->victim); crm_xml_add(reply, F_STONITH_ACTION, cmd->op); crm_xml_add(reply, F_STONITH_ORIGIN, cmd->origin); crm_xml_add_int(reply, F_STONITH_CALLID, cmd->id); crm_xml_add_int(reply, F_STONITH_CALLOPTS, cmd->options); crm_xml_add_int(reply, F_STONITH_RC, rc); crm_xml_add(reply, "st_output", output); if(data != NULL) { crm_info("Attaching reply output"); add_message_xml(reply, F_STONITH_CALLDATA, data); } return reply; } /*! * \internal * \brief Determine if we need to use an alternate node to * fence the target. If so return that node's uname * * \retval NULL, no alternate host * \retval uname, uname of alternate host to use */ static const char * check_alternate_host(const char *target) { const char *alternate_host = NULL; if(g_hash_table_lookup(topology, target) && safe_str_eq(target, stonith_our_uname)) { GHashTableIter gIter; crm_node_t *entry = NULL; int membership = crm_proc_plugin | crm_proc_heartbeat | crm_proc_cpg; g_hash_table_iter_init(&gIter, crm_peer_cache); while (g_hash_table_iter_next(&gIter, NULL, (void **)&entry)) { crm_trace("Checking for %s.%d != %s", entry->uname, entry->id, target); if(entry->uname && (entry->processes & membership) && safe_str_neq(entry->uname, target)) { alternate_host = entry->uname; break; } } if(alternate_host == NULL) { crm_err("No alternate host available to handle complex self fencing request"); g_hash_table_iter_init(&gIter, crm_peer_cache); while (g_hash_table_iter_next(&gIter, NULL, (void **)&entry)) { crm_notice("Peer[%d] %s", entry->id, entry->uname); } } } return alternate_host; } +static void +stonith_send_reply(xmlNode *reply, int call_options, const char *remote_peer, const char *client_id) +{ + if (remote_peer) { + send_cluster_message(crm_get_peer(0, remote_peer), crm_msg_stonith_ng, reply, FALSE); + } else { + do_local_reply(reply, client_id, call_options & st_opt_sync_call, remote_peer!=NULL); + } +} + static int handle_request(stonith_client_t *client, uint32_t id, uint32_t flags, xmlNode *request, const char *remote_peer) { int call_options = 0; int rc = -EOPNOTSUPP; - gboolean always_reply = FALSE; - - xmlNode *reply = NULL; xmlNode *data = NULL; + xmlNode *reply = NULL; char *output = NULL; const char *op = crm_element_value(request, F_STONITH_OPERATION); const char *client_id = crm_element_value(request, F_STONITH_CLIENTID); crm_element_value_int(request, F_STONITH_CALLOPTS, &call_options); if(is_set(call_options, st_opt_sync_call)) { CRM_ASSERT(client == NULL || client->request_id == id); } if(crm_str_eq(op, CRM_OP_REGISTER, TRUE)) { xmlNode *reply = create_xml_node(NULL, "reply"); CRM_ASSERT(client); crm_xml_add(reply, F_STONITH_OPERATION, CRM_OP_REGISTER); crm_xml_add(reply, F_STONITH_CLIENTID, client->id); crm_ipcs_send(client->channel, id, reply, FALSE); client->request_id = 0; free_xml(reply); return 0; } else if(crm_str_eq(op, STONITH_OP_EXEC, TRUE)) { rc = stonith_device_action(request, &output); } else if (crm_str_eq(op, STONITH_OP_TIMEOUT_UPDATE, TRUE)) { const char *call_id = crm_element_value(request, F_STONITH_CALLID); const char *client_id = crm_element_value(request, F_STONITH_CLIENTID); int op_timeout = 0; crm_element_value_int(request, F_STONITH_TIMEOUT, &op_timeout); do_stonith_async_timeout_update(client_id, call_id, op_timeout); return 0; } else if(crm_str_eq(op, STONITH_OP_QUERY, TRUE)) { if (remote_peer) { 
create_remote_stonith_op(client_id, request, TRUE); /* Record it for the future notification */
        }

-       rc = stonith_query(request, &data);
-       always_reply = TRUE;
-       if(!data) {
-           return 0;
-       }
+       stonith_query(request, remote_peer, client_id, call_options);
+       return 0;

    } else if(crm_str_eq(op, T_STONITH_NOTIFY, TRUE)) {
        const char *flag_name = NULL;

        CRM_ASSERT(client);
        flag_name = crm_element_value(request, F_STONITH_NOTIFY_ACTIVATE);
        if(flag_name) {
            crm_debug("Setting %s callbacks for %s (%s): ON",
                      flag_name, client->name, client->id);
            client->flags |= get_stonith_flag(flag_name);
        }

        flag_name = crm_element_value(request, F_STONITH_NOTIFY_DEACTIVATE);
        if(flag_name) {
            crm_debug("Setting %s callbacks for %s (%s): off",
                      flag_name, client->name, client->id);
            client->flags &= ~get_stonith_flag(flag_name);
        }

        if(flags & crm_ipc_client_response) {
            crm_ipcs_send_ack(client->channel, id, "ack", __FUNCTION__, __LINE__);
            client->request_id = 0;
        }
        return 0;

    } else if(crm_str_eq(op, STONITH_OP_RELAY, TRUE)) {
        xmlNode *dev = get_xpath_object("//@"F_STONITH_TARGET, request, LOG_TRACE);

        crm_notice("Peer %s has received a forwarded fencing request from %s to fence (%s) peer %s",
                   stonith_our_uname,
                   client ? client->name : remote_peer,
                   crm_element_value(dev, F_STONITH_ACTION),
                   crm_element_value(dev, F_STONITH_TARGET));

        if(initiate_remote_stonith_op(NULL, request, FALSE) != NULL) {
            rc = -EINPROGRESS;
        }

    } else if(crm_str_eq(op, STONITH_OP_FENCE, TRUE)) {

        if(remote_peer || stand_alone) {
            rc = stonith_fence(request);

        } else if(call_options & st_opt_manual_ack) {
            remote_fencing_op_t *rop = initiate_remote_stonith_op(client, request, TRUE);
            rc = stonith_manual_ack(request, rop);

        } else {
            const char *alternate_host = NULL;
            xmlNode *dev = get_xpath_object("//@"F_STONITH_TARGET, request, LOG_TRACE);
            const char *target = crm_element_value(dev, F_STONITH_TARGET);
            const char *action = crm_element_value(dev, F_STONITH_ACTION);
            const char *device = crm_element_value(dev, F_STONITH_DEVICE);

            if(client) {
                int tolerance = 0;

                crm_notice("Client %s.%.8s wants to fence (%s) '%s' with device '%s'",
                           client->name, client->id, action, target, device?device:"(any)");

                crm_element_value_int(dev, F_STONITH_TOLERANCE, &tolerance);

                if(stonith_check_fence_tolerance(tolerance, target, action)) {
                    rc = 0;
                    goto done;
                }

            } else {
                crm_notice("Peer %s wants to fence (%s) '%s' with device '%s'",
                           remote_peer, action, target, device?device:"(any)");
            }

            alternate_host = check_alternate_host(target);

            if(alternate_host) {
                crm_notice("Forwarding complex self fencing request to peer %s", alternate_host);
                crm_xml_add(request, F_STONITH_OPERATION, STONITH_OP_RELAY);
                crm_xml_add(request, F_STONITH_CLIENTID, client->id);
                send_cluster_message(crm_get_peer(0, alternate_host), crm_msg_stonith_ng, request, FALSE);
                rc = -EINPROGRESS;

            } else if(initiate_remote_stonith_op(client, request, FALSE) != NULL) {
                rc = -EINPROGRESS;
            }
        }

    } else if (crm_str_eq(op, STONITH_OP_FENCE_HISTORY, TRUE)) {
        rc = stonith_fence_history(request, &data);
-       always_reply = TRUE;

    } else if(crm_str_eq(op, STONITH_OP_DEVICE_ADD, TRUE)) {
        const char *id = NULL;
        xmlNode *notify_data = create_xml_node(NULL, op);

        rc = stonith_device_register(request, &id, FALSE);

        crm_xml_add(notify_data, F_STONITH_DEVICE, id);
        crm_xml_add_int(notify_data, F_STONITH_ACTIVE, g_hash_table_size(device_list));

        do_stonith_notify(call_options, op, rc, notify_data);
        free_xml(notify_data);

    } else if(crm_str_eq(op, STONITH_OP_DEVICE_DEL, TRUE)) {
        xmlNode *dev = get_xpath_object("//"F_STONITH_DEVICE, request, LOG_ERR);
        const char *id = crm_element_value(dev, XML_ATTR_ID);
        xmlNode *notify_data = create_xml_node(NULL, op);

        rc = stonith_device_remove(id, FALSE);

        crm_xml_add(notify_data, F_STONITH_DEVICE, id);
        crm_xml_add_int(notify_data, F_STONITH_ACTIVE, g_hash_table_size(device_list));

        do_stonith_notify(call_options, op, rc, notify_data);
        free_xml(notify_data);

    } else if(crm_str_eq(op, STONITH_OP_LEVEL_ADD, TRUE)) {
        char *id = NULL;
        xmlNode *notify_data = create_xml_node(NULL, op);

        rc = stonith_level_register(request, &id);

        crm_xml_add(notify_data, F_STONITH_DEVICE, id);
        crm_xml_add_int(notify_data, F_STONITH_ACTIVE, g_hash_table_size(topology));

        do_stonith_notify(call_options, op, rc, notify_data);
        free_xml(notify_data);

    } else if(crm_str_eq(op, STONITH_OP_LEVEL_DEL, TRUE)) {
        char *id = NULL;
        xmlNode *notify_data = create_xml_node(NULL, op);

        rc = stonith_level_remove(request, &id);

        crm_xml_add(notify_data, F_STONITH_DEVICE, id);
        crm_xml_add_int(notify_data, F_STONITH_ACTIVE, g_hash_table_size(topology));

        do_stonith_notify(call_options, op, rc, notify_data);
        free_xml(notify_data);

    } else if(crm_str_eq(op, STONITH_OP_CONFIRM, TRUE)) {
        async_command_t *cmd = create_async_command(request);
        xmlNode *reply = stonith_construct_async_reply(cmd, NULL, NULL, 0);

        crm_xml_add(reply, F_STONITH_OPERATION, T_STONITH_NOTIFY);
        crm_notice("Broadcasting manual fencing confirmation for node %s", cmd->victim);
        send_cluster_message(NULL, crm_msg_stonith_ng, reply, FALSE);

        free_async_command(cmd);
        free_xml(reply);

    } else {
        crm_err("Unknown %s from %s", op, client ? client->name : remote_peer);
        crm_log_xml_warn(request, "UnknownOp");
    }

done:
-   if (rc == -EINPROGRESS) {
-       /* Nothing (yet) */
-   } else if(remote_peer) {
+   /* Always reply unless the request is still in progress.
+    * If in progress, a reply will happen asynchronously after the request
+    * processing is finished */
+   if (rc != -EINPROGRESS) {
        reply = stonith_construct_reply(request, output, data, rc);
-       send_cluster_message(crm_get_peer(0, remote_peer), crm_msg_stonith_ng, reply, FALSE);
-       free_xml(reply);
-
-   } else if(rc <= pcmk_ok || always_reply) {
-       reply = stonith_construct_reply(request, output, data, rc);
-       do_local_reply(reply, client_id, call_options & st_opt_sync_call, remote_peer!=NULL);
-       free_xml(reply);
+       stonith_send_reply(reply, call_options, remote_peer, client_id);
    }

    free(output);
    free_xml(data);
+   free_xml(reply);

    return rc;
}

static void handle_reply(stonith_client_t *client, xmlNode *request, const char *remote_peer)
{
    const char *op = crm_element_value(request, F_STONITH_OPERATION);

    if(crm_str_eq(op, STONITH_OP_QUERY, TRUE)) {
        process_remote_stonith_query(request);

    } else if(crm_str_eq(op, T_STONITH_NOTIFY, TRUE)) {
        process_remote_stonith_exec(request);

    } else if(crm_str_eq(op, STONITH_OP_FENCE, TRUE)) {
        /* Reply to a complex fencing op */
        process_remote_stonith_exec(request);

    } else {
        crm_err("Unknown %s reply from %s", op, client ? client->name : remote_peer);
        crm_log_xml_warn(request, "UnknownOp");
    }
}
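/* Message routing at a glance: handle_request() answers synchronously
 * through stonith_send_reply() whenever a handler returns anything other
 * than -EINPROGRESS; in-progress operations answer later through
 * stonith_send_async_reply() from the scheduled command's done_cb
 * (normally st_child_done). Replies arriving from peers are dispatched
 * below via handle_reply(). */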
void stonith_command(stonith_client_t *client, uint32_t id, uint32_t flags, xmlNode *request,
                     const char *remote_peer)
{
    int call_options = 0;
    int rc = 0;
    gboolean is_reply = FALSE;
    const char *op = crm_element_value(request, F_STONITH_OPERATION);

    if(get_xpath_object("//"T_STONITH_REPLY, request, LOG_DEBUG_3)) {
        is_reply = TRUE;
    }

    crm_element_value_int(request, F_STONITH_CALLOPTS, &call_options);
    crm_debug("Processing %s%s from %s (%16x)",
              op, is_reply?" reply":"", client?client->name:remote_peer, call_options);

    if(is_set(call_options, st_opt_sync_call)) {
        CRM_ASSERT(client == NULL || client->request_id == id);
    }

    if (is_reply) {
        handle_reply(client, request, remote_peer);
    } else {
        rc = handle_request(client, id, flags, request, remote_peer);
    }

    do_crm_log_unlikely(rc>0?LOG_DEBUG:LOG_INFO,"Processed %s%s from %s: %s (%d)",
                        op, is_reply?" reply":"", client?client->name:remote_peer,
                        rc>0?"":pcmk_strerror(rc), rc);
}
diff --git a/fencing/fence_pcmk b/fencing/fence_pcmk
index b485e474f7..25686f0227 100755
--- a/fencing/fence_pcmk
+++ b/fencing/fence_pcmk
@@ -1,182 +1,182 @@
#!/usr/bin/perl

use Getopt::Long;

my $ME = $0;

END {
    defined fileno STDOUT or return;
    close STDOUT and return;
    warn "$ME: failed to close standard output: $!\n";
    $? ||= 1;
}

# Get the program name from $0 and strip directory names
$_=$0;
s/.*\///;
my $pname = $_;

$opt_o = 'reset';       # Default fence action
$opt_t = '300';         # Default fence timeout (seconds)
$extra_args = '-E';

sub usage
{
-    print "Helper that presents a RHCS-style interface for Linux-HA stonith plugins\n\n";
+    print "Helper that presents a RHCS-style interface to stonith-ng for CMAN based clusters\n\n";
    print "Should never need to be invoked by the user directly\n\n";
    print "\n";
    print "Usage: $pname [options]\n";
    print "\n";
    print "Options:\n";
    print "  -h          usage\n";
    print "  -n nodename\n";
    print "  -o          Action: on | off | reset (default) | stat | hostlist\n";
    print "  -q          quiet mode\n";
    print "  -V          version\n";

    exit 0;
}

sub fail
{
    ($msg) = @_;
    print $msg."\n" unless defined $opt_q;
    $t->close if defined $t;
    exit 1;
}

sub fail_usage
{
    ($msg)=@_;
    print STDERR $msg."\n" if $msg;
    print STDERR "Please use '-h' for usage.\n";
    exit 1;
}

sub version
{
    print "1.0.0\n";
    exit 0;
}

sub get_options_stdin
{
    my $opt;
    my $line = 0;
    while( defined($in = <>) )
    {
        $_ = $in;
        chomp;

        # strip leading and trailing whitespace
        s/^\s*//;
        s/\s*$//;

        # skip comments
        next if /^#/;

        $line+=1;
        $opt=$_;
        next unless $opt;

        ($name,$val)=split /\s*=\s*/, $opt;

        if ( $name eq "" )
        {
            print STDERR "parse error: illegal name in option $line\n";
            exit 2;
        }
        # DO NOTHING -- this field is used by fenced
        elsif ($name eq "agent" ) {}
        elsif ($name eq "timeout" ) {
            $opt_t = $val;
        }
        elsif ($name eq "option" || $name eq "action" )
        {
            $opt_o = $val;
        }
        elsif ($name eq "port" )
        {
            $opt_n = $val;
        }
        else
        {
            $ENV{$name} = $val;
        }
    }
}
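# When invoked by fenced, the options above arrive as key=value pairs on
# stdin rather than as command-line flags, e.g. (hypothetical node name):
#
#     agent=fence_pcmk
#     action=off
#     port=pcmk-2
#
# get_options_stdin() maps these onto the same $opt_o/$opt_n/$opt_t
# variables that GetOptions() fills in below for command-line use.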
#######################################################################
# MAIN

if (@ARGV > 0) {
    GetOptions("n=s"=>\$opt_n,
               "o=s"=>\$opt_o,
               "t=s"=>\$opt_t,
               "q" =>\$opt_q,
               "V" =>\$opt_V,
               "version" =>\$opt_V,
               "help" =>\$opt_h,
               "h" =>\$opt_h) || fail_usage;

    foreach (@ARGV) {
        print "$_\n";
    }

#   getopts("ht:n:o:s:qV") || fail_usage ;

    usage if defined $opt_h;
    version if defined $opt_V;

    fail_usage "Unknown parameter." if (@ARGV > 0);

} else {
    get_options_stdin();
}

$opt_o=lc($opt_o);
fail "failed: unrecognised action: $opt_o"
    unless $opt_o =~ /^(on|off|reset|reboot|stat|status|monitor|list|hostlist|poweroff|poweron)$/;

sub term_handler {
    local $SIG{TERM} = 'IGNORE';
    kill(TERM,-getpgrp($$));
}

setpgrp($$,0);
$agent_pid=$$;
if ( ($pid=fork()) == 0 ) {
    $cmd="--help $opt_o";

    if ( $opt_o eq "monitor" || $opt_o eq "stat" || $opt_o eq "status" || $opt_o eq "list" ) {
        $cmd="--list";
        $cmd="--list-registered" unless defined $opt_n;

    } else {
        fail "failed: no plug number" unless defined $opt_n;

        if ( $opt_o eq "reboot" || $opt_o eq "reset" ) {
            $cmd="--reboot";
        } elsif ( $opt_o eq "on" || $opt_o eq "poweron" ) {
            $cmd="--unfence";
        } elsif ( $opt_o eq "off" || $opt_o eq "poweroff" ) {
            $cmd="--fence";
        }

        system "logger -p daemon.notice \"fence_pcmk[$agent_pid]: Requesting Pacemaker fence $opt_n ($opt_o)\"";
    }

    exec "stonith_admin $cmd $opt_n --tolerance 5s" or die "failed to exec \"stonith_admin $cmd $opt_n\"\n";
}

$SIG{TERM} = \&term_handler;
wait;
$status=$?/256;
print (($status == 0 ? "success":"failed") . ": $opt_n $status\n")
    unless defined $opt_q;

if ( $status != 0 ) {
    system "logger -p daemon.notice \"fence_pcmk[$agent_pid]: Call to fence $opt_n ($opt_o) failed with rc=$status\"";
}

exit ($status == 0 ? 0 : 1 );
diff --git a/fencing/internal.h b/fencing/internal.h
index 670124915e..52ce19bd6a 100644
--- a/fencing/internal.h
+++ b/fencing/internal.h
@@ -1,170 +1,170 @@
#include

/*!
 * \internal
 * \brief Check to see if target was fenced in the last few seconds.
 * \param tolerance, The number of seconds to look back in time
 * \param target, The node to search for
 * \param action, The action we want to match.
 *
 * \retval FALSE, no match
 * \retval TRUE, fencing operation took place in the last 'tolerance' number of seconds.
 */
gboolean stonith_check_fence_tolerance(int tolerance, const char *target, const char *action);

typedef struct stonith_device_s {
    char *id;
    char *agent;
    char *namespace;

    /*! list of actions that must execute on the target node. Used for unfencing */
    char *on_target_actions;

    GListPtr targets;
    time_t targets_age;
    gboolean has_attr_map;
    guint priority;
    guint active_pid;

    GHashTable *params;
    GHashTable *aliases;
    GList *pending_ops;
    crm_trigger_t *work;

    /*! A verified device is one that has contacted the
     * agent successfully to perform a monitor operation */
    gboolean verified;

    gboolean cib_registered;
    gboolean api_registered;
} stonith_device_t;

typedef struct stonith_client_s {
    char *id;
    char *name;

    int pid;
    int request_id;

    char *channel_name;
    qb_ipcs_connection_t *channel;

    long long flags;
} stonith_client_t;

typedef struct remote_fencing_op_s {
    /* The unique id associated with this operation */
    char *id;
    /*! The node this operation will fence */
    char *target;
    /*! The fencing action to perform on the target. (reboot, on, off) */
    char *action;

    /*! Marks if the final notifications have been sent to local stonith clients. */
    gboolean notify_sent;
    /*! The number of query replies received */
    guint replies;
    /*! Does this node own control of this operation */
    gboolean owner;
    /*! After query is complete, this is the high-level timer that expires the entire operation */
    guint op_timer_total;
    /*! This timer expires the current fencing request. Many fencing
     * requests may exist in a single operation */
    guint op_timer_one;
    /*! This timer expires the query request sent out to determine
     * what nodes contain what devices, and who those devices can fence */
    guint query_timer;
    /*!
This is the default timeout to use for each fencing device if no * custom timeout is received in the query. */ gint base_timeout; /*! This is the calculated total timeout an operation can take before * expiring. This is calculated by adding together all the timeout * values associated with the devices this fencing operation may call */ gint total_timeout; /*! Delegate is the node being asked to perform a fencing action * on behalf of the node that owns the remote operation. Some operations * will involve multiple delegates. This value represents the final delegate * that is used. */ char *delegate; /*! The point at which the remote operation completed */ time_t completed; /*! The stonith_call_options associated with this remote operation */ long long call_options; /*! The current state of the remote operation. This indicates * what phase the op is in, query, exec, done, duplicate, failed. */ enum op_state state; /*! The node that owns the remote operation */ char *originator; /*! The local client id that initiated the fencing request */ char *client_id; /*! The name of client that initiated the fencing request */ char *client_name; /*! List of the received query results for all the nodes in the cpg group */ GListPtr query_results; /*! The original request that initiated the remote stonith operation */ xmlNode *request; /*! The current topology level being executed */ guint level; /*! The device list of all the devices at the current executing topology level. */ GListPtr devices; /*! List of duplicate operations attached to this operation. Once this operation * completes, the duplicate operations will be closed out as well. */ GListPtr duplicates; } remote_fencing_op_t; typedef struct stonith_topology_s { char *node; GListPtr levels[ST_LEVEL_MAX]; } stonith_topology_t; extern long long get_stonith_flag(const char *name); extern void stonith_command(stonith_client_t * client, uint32_t id, uint32_t flags, xmlNode * op_request, const char *remote_peer); extern int stonith_device_register(xmlNode * msg, const char **desc, gboolean from_cib); extern int stonith_device_remove(const char *id, gboolean from_cib); extern int stonith_level_register(xmlNode * msg, char **desc); extern int stonith_level_remove(xmlNode * msg, char **desc); extern void do_local_reply(xmlNode * notify_src, const char *client_id, gboolean sync_reply, gboolean from_peer); -extern xmlNode *stonith_construct_reply(xmlNode * request, char *output, xmlNode * data, int rc); +extern xmlNode *stonith_construct_reply(xmlNode * request, const char *output, xmlNode * data, int rc); void do_stonith_async_timeout_update(const char *client, const char *call_id, int timeout); extern void do_stonith_notify(int options, const char *type, int result, xmlNode * data); extern remote_fencing_op_t *initiate_remote_stonith_op(stonith_client_t * client, xmlNode * request, gboolean manual_ack); extern int process_remote_stonith_exec(xmlNode * msg); extern int process_remote_stonith_query(xmlNode * msg); extern void *create_remote_stonith_op(const char *client, xmlNode * request, gboolean peer); extern int stonith_fence_history(xmlNode * msg, xmlNode ** output); extern void free_device(gpointer data); extern void free_topology_entry(gpointer data); extern char *stonith_our_uname; extern gboolean stand_alone; extern GHashTable *device_list; extern GHashTable *topology; extern GHashTable *client_list; diff --git a/fencing/regression.py.in b/fencing/regression.py.in index 9463558e58..b312699acf 100644 --- a/fencing/regression.py.in +++ 
b/fencing/regression.py.in @@ -1,923 +1,985 @@ #!/usr/bin/python # # This program is free software; you can redistribute it and/or # modify it under the terms of the GNU General Public License # as published by the Free Software Foundation; either version 2 # of the License, or (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program; if not, write to the Free Software # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. import os import sys import subprocess import shlex import time def output_from_command(command): test = subprocess.Popen(shlex.split(command), stdout=subprocess.PIPE, stderr=subprocess.PIPE) test.wait() return test.communicate()[0].split("\n") class Test: def __init__(self, name, description, verbose = 0, with_cpg = 0): self.name = name self.description = description self.cmds = [] self.verbose = verbose self.result_txt = "" self.cmd_tool_output = "" self.result_exitcode = 0; self.stonith_options = "-s" self.enable_corosync = 0 if with_cpg: self.stonith_options = "-c" self.enable_corosync = 1 self.stonith_process = None self.stonith_output = "" self.stonith_patterns = [] self.negative_stonith_patterns = [] self.executed = 0 rsc_classes = output_from_command("crm_resource --list-standards") def __new_cmd(self, cmd, args, exitcode, stdout_match = "", no_wait = 0, stdout_negative_match = "", kill=None): self.cmds.append( { "cmd" : cmd, "kill" : kill, "args" : args, "expected_exitcode" : exitcode, "stdout_match" : stdout_match, "stdout_negative_match" : stdout_negative_match, "no_wait" : no_wait, } ) def stop_pacemaker(self): cmd = shlex.split("killall -9 -q pacemakerd") test = subprocess.Popen(cmd, stdout=subprocess.PIPE) test.wait() def start_environment(self): ### make sure we are in full control here ### self.stop_pacemaker() cmd = shlex.split("killall -9 -q stonithd") test = subprocess.Popen(cmd, stdout=subprocess.PIPE) test.wait() if self.verbose: print "Starting stonithd with %s" % self.stonith_options self.stonith_process = subprocess.Popen( shlex.split("@CRM_DAEMON_DIR@/stonithd %s -V" % self.stonith_options), stdout=subprocess.PIPE, stderr=subprocess.PIPE) time.sleep(1) def clean_environment(self): if self.stonith_process: self.stonith_process.terminate() self.stonith_output = self.stonith_process.communicate()[1] self.stonith_process = None if self.verbose: print self.stonith_output def add_stonith_log_pattern(self, pattern): self.stonith_patterns.append(pattern) def add_stonith_negative_log_pattern(self, pattern): self.negative_stonith_patterns.append(pattern) def add_cmd(self, cmd, args): self.__new_cmd(cmd, args, 0, "") def add_cmd_no_wait(self, cmd, args): self.__new_cmd(cmd, args, 0, "", 1) def add_cmd_check_stdout(self, cmd, args, match, no_match = ""): self.__new_cmd(cmd, args, 0, match, 0, no_match) def add_expected_fail_cmd(self, cmd, args, exitcode = 255): self.__new_cmd(cmd, args, exitcode, "") def get_exitcode(self): return self.result_exitcode def print_result(self, filler): print "%s%s" % (filler, self.result_txt) def run_cmd(self, args): cmd = shlex.split(args['args']) cmd.insert(0, args['cmd']) if self.verbose: print "\n\nRunning: "+" ".join(cmd) test = subprocess.Popen(cmd, stdout=subprocess.PIPE, 
stderr=subprocess.PIPE) if args['kill']: if self.verbose: print "Also running: "+args['kill'] subprocess.Popen(shlex.split(args['kill'])) if args['no_wait'] == 0: test.wait() else: return 0 - output = test.communicate()[0] + output_res = test.communicate() + output = output_res[0] + output_res[1] + if self.verbose: print output if args['stdout_match'] != "" and output.count(args['stdout_match']) == 0: test.returncode = -2 print "STDOUT string '%s' was not found in cmd output: %s" % (args['stdout_match'], output) if args['stdout_negative_match'] != "" and output.count(args['stdout_negative_match']) != 0: test.returncode = -2 print "STDOUT string '%s' was found in cmd output: %s" % (args['stdout_negative_match'], output) return test.returncode def count_negative_matches(self, outline): count = 0 for line in self.negative_stonith_patterns: if outline.count(line): count = 1 if self.verbose: print "This pattern should not have matched = '%s'" % (line) return count def match_stonith_patterns(self): negative_matches = 0 cur = 0 pats = self.stonith_patterns total_patterns = len(self.stonith_patterns) if len(self.stonith_patterns) == 0: return for line in self.stonith_output.split("\n"): negative_matches = negative_matches + self.count_negative_matches(line) if len(pats) == 0: continue cur = -1 for p in pats: cur = cur + 1 if line.count(pats[cur]): del pats[cur] break if len(pats) > 0 or negative_matches: if self.verbose: for p in pats: print "Pattern Not Matched = '%s'" % p self.result_txt = "FAILURE - '%s' failed. %d patterns out of %d not matched. %d negative matches." % (self.name, len(pats), total_patterns, negative_matches) self.result_exitcode = -1 def run(self): res = 0 i = 1 self.start_environment() if self.verbose: print "\n--- START TEST - %s" % self.name self.result_txt = "SUCCESS - '%s'" % (self.name) self.result_exitcode = 0 for cmd in self.cmds: res = self.run_cmd(cmd) if res != cmd['expected_exitcode']: print "Step %d FAILED - command returned %d, expected %d" % (i, res, cmd['expected_exitcode']) - self.result_txt = "FAILURE - '%s' failed at step %d. Command: lrmd_test %s" % (self.name, i, cmd['args']) + self.result_txt = "FAILURE - '%s' failed at step %d. 
Command: %s %s" % (self.name, i, cmd['cmd'], cmd['args']) self.result_exitcode = -1 break else: if self.verbose: print "Step %d SUCCESS" % (i) i = i + 1 self.clean_environment() if self.result_exitcode == 0: self.match_stonith_patterns() print self.result_txt if self.verbose: print "--- END TEST - %s\n" % self.name self.executed = 1 return res class Tests: def __init__(self, verbose = 0): self.tests = [] self.verbose = verbose self.autogen_corosync_cfg = 0 if not os.path.exists("/etc/corosync/corosync.conf"): self.autogen_corosync_cfg = 1 def new_test(self, name, description, with_cpg = 0): test = Test(name, description, self.verbose, with_cpg) self.tests.append(test) return test def print_list(self): print "\n==== %d TESTS FOUND ====" % (len(self.tests)) print "%35s - %s" % ("TEST NAME", "TEST DESCRIPTION") print "%35s - %s" % ("--------------------", "--------------------") for test in self.tests: print "%35s - %s" % (test.name, test.description) print "==== END OF LIST ====\n" def start_corosync(self): if self.verbose: print "Starting corosync" test = subprocess.Popen("corosync", stdout=subprocess.PIPE) test.wait() time.sleep(10) def stop_corosync(self): cmd = shlex.split("killall -9 -q corosync") test = subprocess.Popen(cmd, stdout=subprocess.PIPE) test.wait() def run_single(self, name): for test in self.tests: if test.name == name: test.run() break; def run_tests_matching(self, pattern): for test in self.tests: if test.name.count(pattern) != 0: test.run() def run_cpg_only(self): for test in self.tests: if test.enable_corosync: test.run() def run_no_cpg(self): for test in self.tests: if not test.enable_corosync: test.run() def run_tests(self): for test in self.tests: test.run() def exit(self): for test in self.tests: if test.executed == 0: continue if test.get_exitcode() != 0: sys.exit(-1) sys.exit(0) def print_results(self): failures = 0; success = 0; print "\n\n======= FINAL RESULTS ==========" print "\n--- FAILURE RESULTS:" for test in self.tests: if test.executed == 0: continue if test.get_exitcode() != 0: failures = failures + 1 test.print_result(" ") else: success = success + 1 if failures == 0: print " None" print "\n--- TOTALS\n Pass:%d\n Fail:%d\n" % (success, failures) def build_api_sanity_tests(self): verbose_arg = "" if self.verbose: verbose_arg = "-V" test = self.new_test("standalone_low_level_api_test", "Sanity test client api in standalone mode.") test.add_cmd("@CRM_DAEMON_DIR@/stonith-test", "-t %s" % (verbose_arg)) test = self.new_test("cpg_low_level_api_test", "Sanity test client api using mainloop and cpg.", 1) test.add_cmd("@CRM_DAEMON_DIR@/stonith-test", "-m %s" % (verbose_arg)) def build_custom_timeout_tests(self): # custom timeout without topology test = self.new_test("cpg_custom_timeout_1", "Verify per device timeouts work as expected without using topology.", 1) test.add_cmd("stonith_admin", "-R false1 -a fence_false -o \"pcmk_host_list=node1 node2 node3\"") test.add_cmd("stonith_admin", "-R true1 -a fence_true -o \"pcmk_host_list=node3\" -o \"pcmk_off_timeout=1\"") test.add_cmd("stonith_admin", "-R false2 -a fence_false -o \"pcmk_host_list=node3\" -o \"pcmk_off_timeout=4\"") test.add_cmd("stonith_admin", "-F node3 -t 2") # timeout is 2+1+4 = 7 test.add_stonith_log_pattern("remote op timeout set to 7") # custom timeout _WITH_ topology test = self.new_test("cpg_custom_timeout_2", "Verify per device timeouts work as expected _WITH_ topology.", 1) test.add_cmd("stonith_admin", "-R false1 -a fence_false -o \"pcmk_host_list=node1 node2 node3\"") 
test.add_cmd("stonith_admin", "-R true1 -a fence_true -o \"pcmk_host_list=node3\" -o \"pcmk_off_timeout=1\"") test.add_cmd("stonith_admin", "-R false2 -a fence_false -o \"pcmk_host_list=node3\" -o \"pcmk_off_timeout=4000\"") test.add_cmd("stonith_admin", "-r node3 -i 1 -v false1") test.add_cmd("stonith_admin", "-r node3 -i 2 -v true1") test.add_cmd("stonith_admin", "-r node3 -i 3 -v false2") test.add_cmd("stonith_admin", "-F node3 -t 2") # timeout is 2+1+4000 = 4003 test.add_stonith_log_pattern("remote op timeout set to 4003") def build_fence_merge_tests(self): ### Simple test that overlapping fencing operations get merged test = self.new_test("cpg_custom_merge_single", "Verify overlapping identical fencing operations are merged, no fencing levels used.", 1) test.add_cmd("stonith_admin", "-R false1 -a fence_false -o \"pcmk_host_list=node3\"") test.add_cmd("stonith_admin", "-R true1 -a fence_true -o \"pcmk_host_list=node3\" ") test.add_cmd("stonith_admin", "-R false2 -a fence_false -o \"pcmk_host_list=node3\"") test.add_cmd_no_wait("stonith_admin", "-F node3 -t 10") test.add_cmd("stonith_admin", "-F node3 -t 10") ### one merger will happen test.add_stonith_log_pattern("Merging stonith action off for node node3 originating from client") ### the pattern below signifies that both the original and duplicate operation completed test.add_stonith_log_pattern("Operation off of node3 by") test.add_stonith_log_pattern("Operation off of node3 by") ### Test that multiple mergers occur test = self.new_test("cpg_custom_merge_multiple", "Verify multiple overlapping identical fencing operations are merged", 1) test.add_cmd("stonith_admin", "-R false1 -a fence_false -o \"pcmk_host_list=node3\"") test.add_cmd("stonith_admin", "-R true1 -a fence_true -o \"pcmk_host_list=node3\" ") test.add_cmd("stonith_admin", "-R false2 -a fence_false -o \"pcmk_host_list=node3\"") test.add_cmd_no_wait("stonith_admin", "-F node3 -t 10") test.add_cmd_no_wait("stonith_admin", "-F node3 -t 10") test.add_cmd_no_wait("stonith_admin", "-F node3 -t 10") test.add_cmd_no_wait("stonith_admin", "-F node3 -t 10") test.add_cmd("stonith_admin", "-F node3 -t 10") ### 4 mergers should occur test.add_stonith_log_pattern("Merging stonith action off for node node3 originating from client") test.add_stonith_log_pattern("Merging stonith action off for node node3 originating from client") test.add_stonith_log_pattern("Merging stonith action off for node node3 originating from client") test.add_stonith_log_pattern("Merging stonith action off for node node3 originating from client") ### the pattern below signifies that both the original and duplicate operation completed test.add_stonith_log_pattern("Operation off of node3 by") test.add_stonith_log_pattern("Operation off of node3 by") test.add_stonith_log_pattern("Operation off of node3 by") test.add_stonith_log_pattern("Operation off of node3 by") test.add_stonith_log_pattern("Operation off of node3 by") ### Test that multiple mergers occur with topologies used test = self.new_test("cpg_custom_merge_with_topology", "Verify multiple overlapping identical fencing operations are merged with fencing levels.", 1) test.add_cmd("stonith_admin", "-R false1 -a fence_false -o \"pcmk_host_list=node3\"") test.add_cmd("stonith_admin", "-R true1 -a fence_true -o \"pcmk_host_list=node3\" ") test.add_cmd("stonith_admin", "-R false2 -a fence_false -o \"pcmk_host_list=node3\"") test.add_cmd("stonith_admin", "-r node3 -i 1 -v false1") test.add_cmd("stonith_admin", "-r node3 -i 1 -v false2") 
test.add_cmd("stonith_admin", "-r node3 -i 2 -v true1") test.add_cmd_no_wait("stonith_admin", "-F node3 -t 10") test.add_cmd_no_wait("stonith_admin", "-F node3 -t 10") test.add_cmd_no_wait("stonith_admin", "-F node3 -t 10") test.add_cmd_no_wait("stonith_admin", "-F node3 -t 10") test.add_cmd("stonith_admin", "-F node3 -t 10") ### 4 mergers should occur test.add_stonith_log_pattern("Merging stonith action off for node node3 originating from client") test.add_stonith_log_pattern("Merging stonith action off for node node3 originating from client") test.add_stonith_log_pattern("Merging stonith action off for node node3 originating from client") test.add_stonith_log_pattern("Merging stonith action off for node node3 originating from client") ### the pattern below signifies that both the original and duplicate operation completed test.add_stonith_log_pattern("Operation off of node3 by") test.add_stonith_log_pattern("Operation off of node3 by") test.add_stonith_log_pattern("Operation off of node3 by") test.add_stonith_log_pattern("Operation off of node3 by") test.add_stonith_log_pattern("Operation off of node3 by") test = self.new_test("cpg_custom_no_merge", "Verify differing fencing operations are not merged", 1) test.add_cmd("stonith_admin", "-R false1 -a fence_false -o \"pcmk_host_list=node3 node2\"") test.add_cmd("stonith_admin", "-R true1 -a fence_true -o \"pcmk_host_list=node3 node2\" ") test.add_cmd("stonith_admin", "-R false2 -a fence_false -o \"pcmk_host_list=node3 node2\"") test.add_cmd("stonith_admin", "-r node3 -i 1 -v false1") test.add_cmd("stonith_admin", "-r node3 -i 1 -v false2") test.add_cmd("stonith_admin", "-r node3 -i 2 -v true1") test.add_cmd_no_wait("stonith_admin", "-F node2 -t 10") test.add_cmd("stonith_admin", "-F node3 -t 10") test.add_stonith_negative_log_pattern("Merging stonith action off for node node3 originating from client") def build_standalone_tests(self): test_types = [ { "prefix" : "standalone" , "use_cpg" : 0, }, { "prefix" : "cpg" , "use_cpg" : 1, }, ] # test what happens when all devices timeout for test_type in test_types: test = self.new_test("%s_fence_multi_device_failure" % test_type["prefix"], "Verify that all devices timeout, a fencing failure is returned.", test_type["use_cpg"]) test.add_cmd("stonith_admin", "-R false1 -a fence_false -o \"pcmk_host_list=node1 node2 node3\"") test.add_cmd("stonith_admin", "-R false2 -a fence_false -o \"pcmk_host_list=node1 node2 node3\"") test.add_cmd("stonith_admin", "-R false3 -a fence_false -o \"pcmk_host_list=node1 node2 node3\"") test.add_expected_fail_cmd("stonith_admin", "-F node3 -t 2", 194) if test_type["use_cpg"] == 1: test.add_stonith_log_pattern("remote op timeout set to 6") test.add_stonith_log_pattern("for host 'node3' with device 'false1' returned: -62") test.add_stonith_log_pattern("for host 'node3' with device 'false2' returned: -62") test.add_stonith_log_pattern("for host 'node3' with device 'false3' returned: -62") # test what happens when multiple devices can fence a node, but the first device fails. 
for test_type in test_types: test = self.new_test("%s_fence_device_failure_rollover" % test_type["prefix"], "Verify that when one fence device fails for a node, the others are tried.", test_type["use_cpg"]) test.add_cmd("stonith_admin", "-R false1 -a fence_false -o \"pcmk_host_list=node1 node2 node3\"") test.add_cmd("stonith_admin", "-R true1 -a fence_true -o \"pcmk_host_list=node1 node2 node3\"") test.add_cmd("stonith_admin", "-R false2 -a fence_false -o \"pcmk_host_list=node1 node2 node3\"") test.add_cmd("stonith_admin", "-F node3 -t 2") if test_type["use_cpg"] == 1: test.add_stonith_log_pattern("remote op timeout set to 6") # simple topology test for one device for test_type in test_types: if test_type["use_cpg"] == 0: continue test = self.new_test("%s_topology_simple" % test_type["prefix"], "Verify all fencing devices at a level are used.", test_type["use_cpg"]) test.add_cmd("stonith_admin", "-R true -a fence_true -o \"pcmk_host_list=node1 node2 node3\"") test.add_cmd("stonith_admin", "-r node3 -i 1 -v true") test.add_cmd("stonith_admin", "-F node3 -t 2") test.add_stonith_log_pattern("remote op timeout set to 2") test.add_stonith_log_pattern("for host 'node3' with device 'true' returned: 0") # test what happens when the first fencing level has multiple devices. for test_type in test_types: if test_type["use_cpg"] == 0: continue test = self.new_test("%s_topology_device_fails" % test_type["prefix"], "Verify if one device in a level fails, the other is tried.", test_type["use_cpg"]) test.add_cmd("stonith_admin", "-R false -a fence_false -o \"pcmk_host_list=node1 node2 node3\"") test.add_cmd("stonith_admin", "-R true -a fence_true -o \"pcmk_host_list=node1 node2 node3\"") test.add_cmd("stonith_admin", "-r node3 -i 1 -v false") test.add_cmd("stonith_admin", "-r node3 -i 2 -v true") test.add_cmd("stonith_admin", "-F node3 -t 20") test.add_stonith_log_pattern("remote op timeout set to 4") test.add_stonith_log_pattern("for host 'node3' with device 'false' returned: -62") test.add_stonith_log_pattern("for host 'node3' with device 'true' returned: 0") # test what happens when the first fencing level fails. 
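# (Aside: a minimal sketch under the same assumption as the helper above.)
# With topology in use, the total covers every device at every registered
# level, since each may be tried before the operation is abandoned; the
# multi-level test that follows registers six devices across three levels
# with a base timeout of 2, hence its "remote op timeout set to 12".
def expected_topology_timeout(base_timeout, levels):
    # levels: one list of device ids per fencing level
    return base_timeout * sum(len(devices) for devices in levels)
# expected_topology_timeout(2, [["false1", "true1"], ["true2", "false2"], ["true3", "true4"]]) == 12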
for test_type in test_types: if test_type["use_cpg"] == 0: continue test = self.new_test("%s_topology_multi_level_fails" % test_type["prefix"], "Verify that if one level fails, the next level is tried.", test_type["use_cpg"]) test.add_cmd("stonith_admin", "-R true1 -a fence_true -o \"pcmk_host_list=node1 node2 node3\"") test.add_cmd("stonith_admin", "-R true2 -a fence_true -o \"pcmk_host_list=node1 node2 node3\"") test.add_cmd("stonith_admin", "-R true3 -a fence_true -o \"pcmk_host_list=node1 node2 node3\"") test.add_cmd("stonith_admin", "-R true4 -a fence_true -o \"pcmk_host_list=node1 node2 node3\"") test.add_cmd("stonith_admin", "-R false1 -a fence_false -o \"pcmk_host_list=node1 node2 node3\"") test.add_cmd("stonith_admin", "-R false2 -a fence_false -o \"pcmk_host_list=node1 node2 node3\"") test.add_cmd("stonith_admin", "-r node3 -i 1 -v false1") test.add_cmd("stonith_admin", "-r node3 -i 1 -v true1") test.add_cmd("stonith_admin", "-r node3 -i 2 -v true2") test.add_cmd("stonith_admin", "-r node3 -i 2 -v false2") test.add_cmd("stonith_admin", "-r node3 -i 3 -v true3") test.add_cmd("stonith_admin", "-r node3 -i 3 -v true4") test.add_cmd("stonith_admin", "-F node3 -t 2") test.add_stonith_log_pattern("remote op timeout set to 12") test.add_stonith_log_pattern("for host 'node3' with device 'false1' returned: -62") test.add_stonith_log_pattern("for host 'node3' with device 'false2' returned: -62") test.add_stonith_log_pattern("for host 'node3' with device 'true3' returned: 0") test.add_stonith_log_pattern("for host 'node3' with device 'true4' returned: 0") # test what happens when the first fencing level has devices that no one has registered for test_type in test_types: if test_type["use_cpg"] == 0: continue test = self.new_test("%s_topology_missing_devices" % test_type["prefix"], "Verify topology can continue with missing devices.", test_type["use_cpg"]) test.add_cmd("stonith_admin", "-R true2 -a fence_true -o \"pcmk_host_list=node1 node2 node3\"") test.add_cmd("stonith_admin", "-R true3 -a fence_true -o \"pcmk_host_list=node1 node2 node3\"") test.add_cmd("stonith_admin", "-R true4 -a fence_true -o \"pcmk_host_list=node1 node2 node3\"") test.add_cmd("stonith_admin", "-R false2 -a fence_false -o \"pcmk_host_list=node1 node2 node3\"") test.add_cmd("stonith_admin", "-r node3 -i 1 -v false1") test.add_cmd("stonith_admin", "-r node3 -i 1 -v true1") test.add_cmd("stonith_admin", "-r node3 -i 2 -v true2") test.add_cmd("stonith_admin", "-r node3 -i 2 -v false2") test.add_cmd("stonith_admin", "-r node3 -i 3 -v true3") test.add_cmd("stonith_admin", "-r node3 -i 3 -v true4") test.add_cmd("stonith_admin", "-F node3 -t 2") # Test what happens if multiple fencing levels are defined, and then one of them is removed.
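# (Aside, continuing the illustrative sketch above.) Removing a level
# simply drops its devices from the calculation: the removal test that
# follows deletes level 2 with "-d node3 -i 2", leaving four devices at
# levels 1 and 3, so it expects "remote op timeout set to 8".
def expected_timeout_after_removal(base_timeout, levels, removed_index):
    # removed_index is 0-based into the levels list
    kept = [lvl for i, lvl in enumerate(levels) if i != removed_index]
    return base_timeout * sum(len(devices) for devices in kept)
# expected_timeout_after_removal(2, [["false1", "true1"], ["true2", "false2"], ["true3", "true4"]], 1) == 8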
for test_type in test_types: if test_type["use_cpg"] == 0: continue test = self.new_test("%s_topology_level_removal" % test_type["prefix"], "Verify level removal works.", test_type["use_cpg"]) test.add_cmd("stonith_admin", "-R true1 -a fence_true -o \"pcmk_host_list=node1 node2 node3\"") test.add_cmd("stonith_admin", "-R true2 -a fence_true -o \"pcmk_host_list=node1 node2 node3\"") test.add_cmd("stonith_admin", "-R true3 -a fence_true -o \"pcmk_host_list=node1 node2 node3\"") test.add_cmd("stonith_admin", "-R true4 -a fence_true -o \"pcmk_host_list=node1 node2 node3\"") test.add_cmd("stonith_admin", "-R false1 -a fence_false -o \"pcmk_host_list=node1 node2 node3\"") test.add_cmd("stonith_admin", "-R false2 -a fence_false -o \"pcmk_host_list=node1 node2 node3\"") test.add_cmd("stonith_admin", "-r node3 -i 1 -v false1") test.add_cmd("stonith_admin", "-r node3 -i 1 -v true1") test.add_cmd("stonith_admin", "-r node3 -i 2 -v true2") test.add_cmd("stonith_admin", "-r node3 -i 2 -v false2") test.add_cmd("stonith_admin", "-r node3 -i 3 -v true3") test.add_cmd("stonith_admin", "-r node3 -i 3 -v true4") # Now remove level 2, verify none of the devices in level two are hit. test.add_cmd("stonith_admin", "-d node3 -i 2") test.add_cmd("stonith_admin", "-F node3 -t 20") test.add_stonith_log_pattern("remote op timeout set to 8") test.add_stonith_log_pattern("for host 'node3' with device 'false1' returned: -62") test.add_stonith_negative_log_pattern("for host 'node3' with device 'false2' returned: -62") test.add_stonith_negative_log_pattern("for host 'node3' with device 'false2' returned: -1001") test.add_stonith_log_pattern("for host 'node3' with device 'true3' returned: 0") test.add_stonith_log_pattern("for host 'node3' with device 'true4' returned: 0") # test the stonith builds the correct list of devices that can fence a node. 
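# (Aside: a hypothetical usage sketch.) The same check the list test below
# performs can be made outside the harness with this module's own
# output_from_command() helper; the exact report formatting belongs to
# stonith_admin, so callers should only grep the lines for device names.
def devices_that_can_fence(node):
    # returns the raw report lines from `stonith_admin -l <node> -V`
    return output_from_command("stonith_admin -l %s -V" % node)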
for test_type in test_types: test = self.new_test("%s_list_devices" % test_type["prefix"], "Verify list of devices that can fence a node is correct", test_type["use_cpg"]) test.add_cmd("stonith_admin", "-R true1 -a fence_true -o \"pcmk_host_list=node3\"") test.add_cmd("stonith_admin", "-R true2 -a fence_true -o \"pcmk_host_list=node1 node2 node3\"") test.add_cmd("stonith_admin", "-R true3 -a fence_true -o \"pcmk_host_list=node1 node2 node3\"") test.add_cmd_check_stdout("stonith_admin", "-l node1 -V", "true2", "true1") test.add_cmd_check_stdout("stonith_admin", "-l node1 -V", "true3", "true1") # simple test of device monitor for test_type in test_types: test = self.new_test("%s_monitor" % test_type["prefix"], "Verify device is reachable", test_type["use_cpg"]) test.add_cmd("stonith_admin", "-R true1 -a fence_true -o \"pcmk_host_list=node3\"") test.add_cmd("stonith_admin", "-R false1 -a fence_false -o \"pcmk_host_list=node3\"") test.add_cmd("stonith_admin", "-Q true1") test.add_cmd("stonith_admin", "-Q false1") test.add_expected_fail_cmd("stonith_admin", "-Q true2", 237) + # Verify monitor occurs for duration of timeout period on failure + for test_type in test_types: + test = self.new_test("%s_monitor_timeout" % test_type["prefix"], + "Verify monitor uses duration of timeout period given.", test_type["use_cpg"]) + test.add_cmd("stonith_admin", "-R true1 -a fence_dummy_monitor_fail -o \"pcmk_host_list=node3\"") + test.add_expected_fail_cmd("stonith_admin", "-Q true1 -t 5", 23) + test.add_stonith_log_pattern("Attempt 2 to execute") + + # Verify monitor occurs for duration of timeout period on failure, but stops at max retries + for test_type in test_types: + test = self.new_test("%s_monitor_timeout_max_retries" % test_type["prefix"], + "Verify monitor retries until max retry value or timeout is hit.", test_type["use_cpg"]) + test.add_cmd("stonith_admin", "-R true1 -a fence_dummy_monitor_fail -o \"pcmk_host_list=node3\"") + test.add_expected_fail_cmd("stonith_admin", "-Q true1 -t 15", 23) + test.add_stonith_log_pattern("Attempt 10 to execute") + # simple register test for test_type in test_types: test = self.new_test("%s_register" % test_type["prefix"], "Verify devices can be registered and un-registered", test_type["use_cpg"]) test.add_cmd("stonith_admin", "-R true1 -a fence_true -o \"pcmk_host_list=node3\"") test.add_cmd("stonith_admin", "-Q true1") test.add_cmd("stonith_admin", "-D true1") test.add_expected_fail_cmd("stonith_admin", "-Q true1", 237) # simple reboot test for test_type in test_types: test = self.new_test("%s_reboot" % test_type["prefix"], "Verify devices can be rebooted", test_type["use_cpg"]) test.add_cmd("stonith_admin", "-R true1 -a fence_true -o \"pcmk_host_list=node3\"") test.add_cmd("stonith_admin", "-B node3 -t 2") test.add_cmd("stonith_admin", "-D true1") test.add_expected_fail_cmd("stonith_admin", "-Q true1", 237) # test fencing history. 
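# (Aside: a hypothetical helper mirroring the history check below.)
# `stonith_admin -H <node>` prints the recorded fencing history for a
# node; the test that follows greps it for the success phrase.
def last_fence_succeeded(node):
    out = "\n".join(output_from_command("stonith_admin -H %s" % node))
    return out.count("was able to turn off node %s" % node) > 0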
for test_type in test_types: if test_type["use_cpg"] == 0: continue test = self.new_test("%s_fence_history" % test_type["prefix"], "Verify last fencing operation is returned.", test_type["use_cpg"]) test.add_cmd("stonith_admin", "-R true1 -a fence_true -o \"pcmk_host_list=node3\"") test.add_cmd("stonith_admin", "-F node3 -t 2 -V") test.add_cmd_check_stdout("stonith_admin", "-H node3", "was able to turn off node node3", "") + # simple test of dynamic list query + for test_type in test_types: + test = self.new_test("%s_dynamic_list_query" % test_type["prefix"], + "Verify dynamic list of fencing devices can be retrieved.", test_type["use_cpg"]) + test.add_cmd("stonith_admin", "-R true1 -a fence_true") + test.add_cmd("stonith_admin", "-R true2 -a fence_true") + test.add_cmd("stonith_admin", "-R true3 -a fence_true") + + test.add_cmd_check_stdout("stonith_admin", "-l fake_port_1", "3 devices found") + + + # fence using dynamic list query + for test_type in test_types: + test = self.new_test("%s_fence_dynamic_list_query" % test_type["prefix"], + "Verify dynamic list of fencing devices can be retrieved.", test_type["use_cpg"]) + test.add_cmd("stonith_admin", "-R true1 -a fence_true") + test.add_cmd("stonith_admin", "-R true2 -a fence_true") + test.add_cmd("stonith_admin", "-R true3 -a fence_true") + + test.add_cmd("stonith_admin", "-F fake_port_1 -t 5 -V"); + + # simple test of query using status action + for test_type in test_types: + test = self.new_test("%s_status_query" % test_type["prefix"], + "Verify dynamic list of fencing devices can be retrieved.", test_type["use_cpg"]) + test.add_cmd("stonith_admin", "-R true1 -a fence_true -o \"pcmk_host_check=status\"") + test.add_cmd("stonith_admin", "-R true2 -a fence_true -o \"pcmk_host_check=status\"") + test.add_cmd("stonith_admin", "-R true3 -a fence_true -o \"pcmk_host_check=status\"") + + test.add_cmd_check_stdout("stonith_admin", "-l fake_port_1", "3 devices found") + def build_unfence_tests(self): our_uname = output_from_command("uname -n") if our_uname: our_uname = our_uname[0] ### Simple test unfencing works test = self.new_test("cpg_unfence_simple", "Verify simple unfencing.", 1) test.add_cmd("stonith_admin", "-R false1 -a fence_false -o \"pcmk_host_list=node3\"") test.add_cmd("stonith_admin", "-R true1 -a fence_true -o \"pcmk_host_list=node3\" ") test.add_cmd("stonith_admin", "-R false2 -a fence_false -o \"pcmk_host_list=node3\"") test.add_cmd("stonith_admin", "-U node3 -t 3") ### verify unfencing using on_target device test = self.new_test("cpg_unfence_on_target_1", "Verify unfencing with on_target = true", 1) test.add_cmd("stonith_admin", "-R true1 -a fence_dummy_on_target -o \"pcmk_host_list=%s\"" % (our_uname)) test.add_cmd("stonith_admin", "-U %s -t 3" % (our_uname)) test.add_stonith_log_pattern("(on) to be executed on the target node") ### verify failure of unfencing using on_target device test = self.new_test("cpg_unfence_on_target_2", "Verify failure unfencing with on_target = true", 1) test.add_cmd("stonith_admin", "-R true1 -a fence_dummy_on_target -o \"pcmk_host_list=%s node_fake_1234\"" % (our_uname)) test.add_expected_fail_cmd("stonith_admin", "-U node_fake_1234 -t 3", 194) test.add_stonith_log_pattern("(on) to be executed on the target node") ### verify unfencing using on_target device with topology test = self.new_test("cpg_unfence_on_target_3", "Verify unfencing with on_target = true using topology", 1) test.add_cmd("stonith_admin", "-R false1 -a fence_false -o \"pcmk_host_list=%s node3\"" % (our_uname)) 
test.add_cmd("stonith_admin", "-R false2 -a fence_false -o \"pcmk_host_list=%s node3\"" % (our_uname)) test.add_cmd("stonith_admin", "-R true1 -a fence_dummy_on_target -o \"pcmk_host_list=%s node3\"" % (our_uname)) test.add_cmd("stonith_admin", "-r %s -i 1 -v false1" % (our_uname)) test.add_cmd("stonith_admin", "-r %s -i 2 -v false2" % (our_uname)) test.add_cmd("stonith_admin", "-r %s -i 3 -v true1" % (our_uname)) test.add_cmd("stonith_admin", "-U %s -t 3" % (our_uname)) test.add_stonith_log_pattern("(on) to be executed on the target node") ### verify unfencing using on_target device with topology fails test = self.new_test("cpg_unfence_on_target_4", "Verify unfencing failure with on_target = true using topology", 1) test.add_cmd("stonith_admin", "-R false1 -a fence_false -o \"pcmk_host_list=%s node_fake\"" % (our_uname)) test.add_cmd("stonith_admin", "-R false2 -a fence_false -o \"pcmk_host_list=%s node_fake\"" % (our_uname)) test.add_cmd("stonith_admin", "-R true1 -a fence_dummy_on_target -o \"pcmk_host_list=%s node_fake\"" % (our_uname)) test.add_cmd("stonith_admin", "-r node_fake -i 1 -v false1") test.add_cmd("stonith_admin", "-r node_fake -i 2 -v false2") test.add_cmd("stonith_admin", "-r node_fake -i 3 -v true1") test.add_expected_fail_cmd("stonith_admin", "-U node_fake -t 3", 194) test.add_stonith_log_pattern("(on) to be executed on the target node") ### verify use of on_target = true for "on" action does not interfere with "off" action test = self.new_test("cpg_unfence_on_target_ignored", "Verify on target is ignored for other actions", 1) test.add_cmd("stonith_admin", "-R false1 -a fence_false -o \"pcmk_host_list=%s node_fake\"" % (our_uname)) test.add_cmd("stonith_admin", "-R false2 -a fence_false -o \"pcmk_host_list=%s node_fake\"" % (our_uname)) test.add_cmd("stonith_admin", "-R true1 -a fence_dummy_on_target -o \"pcmk_host_list=%s node_fake\"" % (our_uname)) test.add_cmd("stonith_admin", "-r node_fake -i 1 -v false1") test.add_cmd("stonith_admin", "-r node_fake -i 2 -v false2") test.add_cmd("stonith_admin", "-r node_fake -i 3 -v true1") test.add_cmd("stonith_admin", "-F node_fake -t 3") test.add_stonith_log_pattern("(on) to be executed on the target node") def setup_environment(self, use_corosync): if self.autogen_corosync_cfg and use_corosync: corosync_conf = (""" totem { version: 2 crypto_cipher: none crypto_hash: none nodeid: 101 secauth: off interface { ttl: 1 ringnumber: 0 mcastport: 6666 mcastaddr: 226.94.1.1 bindnetaddr: 127.0.0.1 } } logging { debug: off fileline: off to_syslog: no to_stderr: no syslog_facility: daemon timestamp: on to_logfile: yes logfile: /var/log/corosync.log logfile_priority: info } """) os.system("cat <<-END >>/etc/corosync/corosync.conf\n%s\nEND" % (corosync_conf)) if use_corosync: ### make sure we are in control ### self.stop_corosync() self.start_corosync() - + monitor_fail_agent = ("""#!/usr/bin/python +import sys +def main(): + for line in sys.stdin.readlines(): + if line.count("monitor") > 0: + sys.exit(-1); + sys.exit(-1) +if __name__ == "__main__": + main() +""") on_target_agent = ("""#!/usr/bin/python import sys def main(): for line in sys.stdin.readlines(): if line.count("monitor") > 0: sys.exit(0) if line.count("metadata") > 0: print '' print ' dummy description.' 
print ' http://www.example.com' print ' ' print ' ' print ' ' print ' ' print ' Fencing Action' print ' ' print ' ' print ' ' print ' ' print ' Physical plug number or name of virtual machine' print ' ' print ' ' print ' ' print ' ' print ' ' print ' ' print ' ' print ' ' print '' sys.exit(0) if line.count("on") > 0: sys.exit(0) sys.exit(-1) if __name__ == "__main__": main() """) os.system("cat <<-END >>/usr/sbin/fence_dummy_on_target\n%s\nEND" % (on_target_agent)) os.system("chmod 711 /usr/sbin/fence_dummy_on_target") + + os.system("cat <<-END >>/usr/sbin/fence_dummy_monitor_fail\n%s\nEND" % (monitor_fail_agent)) + os.system("chmod 711 /usr/sbin/fence_dummy_monitor_fail") os.system("cp /usr/share/pacemaker/tests/cts/fence_false /usr/sbin/fence_false") os.system("cp /usr/share/pacemaker/tests/cts/fence_true /usr/sbin/fence_true") def cleanup_environment(self, use_corosync): if use_corosync: self.stop_corosync() if self.verbose and os.path.exists('/var/log/corosync.log'): print "Daemon output" f = open('/var/log/corosync.log', 'r') for line in f.readlines(): print line.strip() os.remove('/var/log/corosync.log') if self.autogen_corosync_cfg: os.system("rm -f /etc/corosync/corosync.conf") os.system("rm -f /usr/sbin/fence_dummy_on_target") + os.system("rm -f /usr/sbin/fence_dummy_monitor_fail") class TestOptions: def __init__(self): self.options = {} self.options['list-tests'] = 0 self.options['run-all'] = 1 self.options['run-only'] = "" self.options['run-only-pattern'] = "" self.options['verbose'] = 0 self.options['invalid-arg'] = "" self.options['cpg-only'] = 0 self.options['no-cpg'] = 0 self.options['show-usage'] = 0 def build_options(self, argv): args = argv[1:] skip = 0 for i in range(0, len(args)): if skip: skip = 0 continue elif args[i] == "-h" or args[i] == "--help": self.options['show-usage'] = 1 elif args[i] == "-l" or args[i] == "--list-tests": self.options['list-tests'] = 1 elif args[i] == "-V" or args[i] == "--verbose": self.options['verbose'] = 1 elif args[i] == "-n" or args[i] == "--no-cpg": self.options['no-cpg'] = 1 elif args[i] == "-c" or args[i] == "--cpg-only": self.options['cpg-only'] = 1 elif args[i] == "-r" or args[i] == "--run-only": self.options['run-only'] = args[i+1] skip = 1 elif args[i] == "-p" or args[i] == "--run-only-pattern": self.options['run-only-pattern'] = args[i+1] skip = 1 def show_usage(self): print "usage: " + sys.argv[0] + " [options]" print "If no options are provided, all tests will run" print "Options:" print "\t [--help | -h] Show usage" print "\t [--list-tests | -l] Print out all registered tests." print "\t [--cpg-only | -c] Only run tests that require corosync." 
print "\t [--no-cpg | -n] Only run tests that do not require corosync" print "\t [--run-only | -r 'testname'] Run a specific test" print "\t [--verbose | -V] Verbose output" print "\t [--run-only-pattern | -p 'string'] Run only tests containing the string value" print "\n\tExample: Run only the test 'start_top'" print "\t\t python ./regression.py --run-only start_stop" print "\n\tExample: Run only the tests with the string 'systemd' present in them" print "\t\t python ./regression.py --run-only-pattern systemd" def main(argv): o = TestOptions() o.build_options(argv) use_corosync = 1 tests = Tests(o.options['verbose']) tests.build_standalone_tests() tests.build_custom_timeout_tests() tests.build_api_sanity_tests() tests.build_fence_merge_tests() tests.build_unfence_tests() if o.options['list-tests']: tests.print_list() sys.exit(0) elif o.options['show-usage']: o.show_usage() sys.exit(0) print "Starting ..." if o.options['no-cpg']: use_corosync = 0 tests.setup_environment(use_corosync) if o.options['run-only-pattern'] != "": tests.run_tests_matching(o.options['run-only-pattern']) tests.print_results() elif o.options['run-only'] != "": tests.run_single(o.options['run-only']) tests.print_results() elif o.options['no-cpg']: tests.run_no_cpg() tests.print_results() elif o.options['cpg-only']: tests.run_cpg_only() tests.print_results() else: tests.run_tests() tests.print_results() tests.cleanup_environment(use_corosync) tests.exit() if __name__=="__main__": main(sys.argv) diff --git a/fencing/remote.c b/fencing/remote.c index 1ebd106b9a..afb82869ee 100644 --- a/fencing/remote.c +++ b/fencing/remote.c @@ -1,1212 +1,1224 @@ /* * Copyright (C) 2009 Andrew Beekhof * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public * License as published by the Free Software Foundation; either * version 2 of the License, or (at your option) any later version. * * This software is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * General Public License for more details. 
* * You should have received a copy of the GNU General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #define TIMEOUT_MULTIPLY_FACTOR 1.2 typedef struct st_query_result_s { char *host; int devices; GListPtr device_list; GHashTable *custom_action_timeouts; /* Subset of devices that peer has verified connectivity on */ GHashTable *verified_devices; } st_query_result_t; GHashTable *remote_op_list = NULL; void call_remote_stonith(remote_fencing_op_t *op, st_query_result_t *peer); static void remote_op_done(remote_fencing_op_t *op, xmlNode *data, int rc, int dup); extern xmlNode *stonith_create_op( int call_id, const char *token, const char *op, xmlNode *data, int call_options); static void report_timeout_period(remote_fencing_op_t *op, int op_timeout); static int get_op_total_timeout(remote_fencing_op_t *op, st_query_result_t *chosen_peer, int default_timeout); static void free_remote_query(gpointer data) { if(data) { st_query_result_t *query = data; crm_trace("Free'ing query result from %s", query->host); free(query->host); g_hash_table_destroy(query->custom_action_timeouts); g_hash_table_destroy(query->verified_devices); free(query); } } static void clear_remote_op_timers(remote_fencing_op_t *op) { if(op->query_timer) { g_source_remove(op->query_timer); op->query_timer = 0; } if(op->op_timer_total) { g_source_remove(op->op_timer_total); op->op_timer_total = 0; } if(op->op_timer_one) { g_source_remove(op->op_timer_one); op->op_timer_one = 0; } } static void free_remote_op(gpointer data) { remote_fencing_op_t *op = data; crm_trace("Free'ing op %s for %s", op->id, op->target); crm_log_xml_debug(op->request, "Destroying"); clear_remote_op_timers(op); free(op->id); free(op->action); free(op->target); free(op->client_id); free(op->client_name); free(op->originator); if(op->query_results) { g_list_free_full(op->query_results, free_remote_query); } if(op->request) { free_xml(op->request); op->request = NULL; } free(op); } static xmlNode * create_op_done_notify(remote_fencing_op_t *op, int rc) { xmlNode *notify_data = create_xml_node(NULL, T_STONITH_NOTIFY_FENCE); crm_xml_add_int(notify_data, "state", op->state); crm_xml_add_int(notify_data, F_STONITH_RC, rc); crm_xml_add(notify_data, F_STONITH_TARGET, op->target); crm_xml_add(notify_data, F_STONITH_ACTION, op->action); crm_xml_add(notify_data, F_STONITH_DELEGATE, op->delegate); crm_xml_add(notify_data, F_STONITH_REMOTE_OP_ID, op->id); crm_xml_add(notify_data, F_STONITH_ORIGIN, op->originator); crm_xml_add(notify_data, F_STONITH_CLIENTID, op->client_id); crm_xml_add(notify_data, F_STONITH_CLIENTNAME,op->client_name); return notify_data; } static void bcast_result_to_peers(remote_fencing_op_t *op, int rc) { static int count = 0; xmlNode *bcast = create_xml_node(NULL, T_STONITH_REPLY); xmlNode *notify_data = create_op_done_notify(op, rc); count++; crm_trace("Broadcasting result to peers"); crm_xml_add(bcast, F_TYPE, T_STONITH_NOTIFY); crm_xml_add(bcast, F_SUBTYPE, "broadcast"); crm_xml_add(bcast, F_STONITH_OPERATION, T_STONITH_NOTIFY); crm_xml_add_int(bcast, "count", count); add_message_xml(bcast, F_STONITH_CALLDATA, notify_data); send_cluster_message(NULL, crm_msg_stonith_ng, bcast, FALSE); free_xml(notify_data); free_xml(bcast); return; } static void 
handle_local_reply_and_notify(remote_fencing_op_t *op, xmlNode *data, int rc) { xmlNode *notify_data = NULL; xmlNode *reply = NULL; if (op->notify_sent == TRUE) { /* nothing to do */ return; } /* Do notification with a clean data object */ notify_data = create_op_done_notify(op, rc); crm_xml_add_int(data, "state", op->state); crm_xml_add(data, F_STONITH_TARGET, op->target); crm_xml_add(data, F_STONITH_OPERATION, op->action); reply = stonith_construct_reply(op->request, NULL, data, rc); crm_xml_add(reply, F_STONITH_DELEGATE, op->delegate); /* Send fencing OP reply to local client that initiated fencing */ do_local_reply(reply, op->client_id, op->call_options & st_opt_sync_call, FALSE); /* bcast to all local clients that the fencing operation happened */ do_stonith_notify(0, T_STONITH_NOTIFY_FENCE, rc, notify_data); /* mark this op as having its notifications already sent */ op->notify_sent = TRUE; free_xml(reply); free_xml(notify_data); } static void handle_duplicates(remote_fencing_op_t *op, xmlNode *data, int rc) { GListPtr iter = NULL; for(iter = op->duplicates; iter != NULL; iter = iter->next) { remote_fencing_op_t *other = iter->data; if(other->state == st_duplicate) { /* I.e. it hasn't timed out already */ other->state = op->state; crm_debug("Performing duplicate notification for %s@%s.%.8s = %s", other->client_name, other->originator, other->id, pcmk_strerror(rc)); remote_op_done(other, data, rc, TRUE); } else { crm_err("Skipping duplicate notification for %s@%s - %d", other->client_name, other->originator, other->state); } } } /*! * \internal * \brief Finalize a remote operation. * * \description This function has two code paths. * * Path 1. This node is the owner of the operation and needs * to notify the cpg group via a broadcast as to the operation's * results. * * Path 2. The cpg broadcast is received. All nodes notify their local * stonith clients of the operation results. * * So, the owner of the operation first notifies the cluster of the result, * and once that cpg notify is received back it notifies all the local clients. * * Nodes that are passive watchers of the operation will receive the * broadcast and only need to notify their local clients that the operation finished. * * \param op, The fencing operation to finalize * \param data, The xml msg reply (if present) of the last delegated fencing * operation. * \param dup, Is this operation a duplicate, if so treat it a little differently * making sure the broadcast is not sent out. */ static void remote_op_done(remote_fencing_op_t *op, xmlNode *data, int rc, int dup) { int level = LOG_ERR; const char *subt = NULL; xmlNode *local_data = NULL; op->completed = time(NULL); clear_remote_op_timers(op); if(op->notify_sent == TRUE) { crm_err("Already sent notifications for '%s of %s by %s' (for=%s@%s.%.8s, state=%d): %s", op->action, op->target, op->delegate?op->delegate:"", op->client_name, op->originator, op->id, op->state, pcmk_strerror(rc)); goto remote_op_done_cleanup; } if(!op->delegate && data) { op->delegate = crm_element_value_copy(data, F_ORIG); } if(data == NULL) { data = create_xml_node(NULL, "remote-op"); local_data = data; } /* Tell everyone the operation is done; we will continue * with doing the local notifications once we receive * the broadcast back.
*/ subt = crm_element_value(data, F_SUBTYPE); if(dup == FALSE && safe_str_neq(subt, "broadcast")) { /* Defer notification until the bcast message arrives */ bcast_result_to_peers(op, rc); goto remote_op_done_cleanup; } if(rc == pcmk_ok || dup) { level = LOG_NOTICE; } else if(safe_str_neq(op->originator, stonith_our_uname)) { level = LOG_NOTICE; } do_crm_log(level, "Operation %s of %s by %s for %s@%s.%.8s: %s", op->action, op->target, op->delegate?op->delegate:"", op->client_name, op->originator, op->id, pcmk_strerror(rc)); handle_local_reply_and_notify(op, data, rc); if (dup == FALSE) { handle_duplicates(op, data, rc); } /* Free non-essential parts of the record * Keep the record around so we can query the history */ if(op->query_results) { g_list_free_full(op->query_results, free_remote_query); op->query_results = NULL; } if(op->request) { free_xml(op->request); op->request = NULL; } remote_op_done_cleanup: free_xml(local_data); } static gboolean remote_op_timeout_one(gpointer userdata) { remote_fencing_op_t *op = userdata; op->op_timer_one = 0; crm_notice("Remote %s operation on %s for %s.%8s timed out", op->action, op->target, op->client_name, op->id); call_remote_stonith(op, NULL); return FALSE; } static gboolean remote_op_timeout(gpointer userdata) { remote_fencing_op_t *op = userdata; op->op_timer_total = 0; if(op->state == st_done) { crm_debug("Action %s (%s) for %s (%s) already completed", op->action, op->id, op->target, op->client_name); return FALSE; } crm_debug("Action %s (%s) for %s (%s) timed out", op->action, op->id, op->target, op->client_name); op->state = st_failed; remote_op_done(op, NULL, -ETIME, FALSE); return FALSE; } static gboolean remote_op_query_timeout(gpointer data) { remote_fencing_op_t *op = data; op->query_timer = 0; if(op->state == st_done) { crm_debug("Operation %s for %s already completed", op->id, op->target); } else if(op->state == st_exec) { crm_debug("Operation %s for %s already in progress", op->id, op->target); } else if(op->query_results) { crm_debug("Query %s for %s complete: %d", op->id, op->target, op->state); call_remote_stonith(op, NULL); } else { crm_debug("Query %s for %s timed out: %d", op->id, op->target, op->state); if(op->op_timer_total) { g_source_remove(op->op_timer_total); op->op_timer_total = 0; } remote_op_timeout(op); } return FALSE; } static int stonith_topology_next(remote_fencing_op_t *op) { stonith_topology_t *tp = NULL; if(op->target) { /* Queries don't have a target set */ tp = g_hash_table_lookup(topology, op->target); } if(tp == NULL) { return pcmk_ok; } set_bit(op->call_options, st_opt_topology); do { op->level++; } while(op->level < ST_LEVEL_MAX && tp->levels[op->level] == NULL); if(op->level < ST_LEVEL_MAX) { crm_trace("Attempting fencing level %d for %s (%d devices) - %s@%s.%.8s", op->level, op->target, g_list_length(tp->levels[op->level]), op->client_name, op->originator, op->id); op->devices = tp->levels[op->level]; return pcmk_ok; } crm_notice("All fencing options to fence %s for %s@%s.%.8s failed", op->target, op->client_name, op->originator, op->id); return -EINVAL; } /*! * \brief Check to see if this operation is a duplicate of another in flight * operation. If so merge this operation into the inflight operation, and mark * it as a duplicate. 
*/ static void merge_duplicates(remote_fencing_op_t *op) { GHashTableIter iter; remote_fencing_op_t *other = NULL; g_hash_table_iter_init(&iter, remote_op_list); while(g_hash_table_iter_next(&iter, NULL, (void**)&other)) { if(other->state > st_exec) { /* Must be in-progress */ continue; } else if (safe_str_neq(op->target, other->target)) { /* Must be for the same node */ continue; } else if(safe_str_neq(op->action, other->action)) { crm_trace("Must be for the same action: %s vs. %s", op->action, other->action); continue; } else if(safe_str_eq(op->client_name, other->client_name)) { crm_trace("Must be for different clients: %s", op->client_name); continue; } else if(safe_str_eq(other->target, other->originator)) { crm_trace("Can't be a suicide operation: %s", other->target); continue; } /* There is another in-flight request to fence the same host * Piggyback on that instead. If it fails, so do we. */ other->duplicates = g_list_append(other->duplicates, op); if(other->total_timeout == 0) { crm_trace("Making a best-guess as to the timeout used"); other->total_timeout = op->total_timeout = TIMEOUT_MULTIPLY_FACTOR * get_op_total_timeout(op, NULL, op->base_timeout); } crm_notice("Merging stonith action %s for node %s originating from client %s.%.8s with identical request from %s@%s.%.8s (%ds)", op->action, op->target, op->client_name, op->id, other->client_name, other->originator, other->id, other->total_timeout); report_timeout_period(op, other->total_timeout); op->state = st_duplicate; } } /*! * \internal * \brief Create a new remote stonith op * \param client, The local stonith client id that initiated the operation * \param request, The request from the client that started the operation * \param peer, Is this operation owned by another stonith peer? Operations * owned by other peers are stored on all the stonith nodes, but only the * owner executes the operation. All the nodes get the results of the operation * once the owner finishes executing it. */ void *create_remote_stonith_op(const char *client, xmlNode *request, gboolean peer) { remote_fencing_op_t *op = NULL; xmlNode *dev = get_xpath_object("//@"F_STONITH_TARGET, request, LOG_TRACE); if(remote_op_list == NULL) { remote_op_list = g_hash_table_new_full( crm_str_hash, g_str_equal, NULL, free_remote_op); } /* If this operation is owned by another node, check to make * sure we haven't already created this operation.
*/ if(peer && dev) { const char *op_id = crm_element_value(dev, F_STONITH_REMOTE_OP_ID); CRM_CHECK(op_id != NULL, return NULL); op = g_hash_table_lookup(remote_op_list, op_id); if(op) { crm_debug("%s already exists", op_id); return op; } } op = calloc(1, sizeof(remote_fencing_op_t)); crm_element_value_int(request, F_STONITH_TIMEOUT, (int*)&(op->base_timeout)); if(peer && dev) { op->id = crm_element_value_copy(dev, F_STONITH_REMOTE_OP_ID); } else { op->id = crm_generate_uuid(); } g_hash_table_replace(remote_op_list, op->id, op); CRM_LOG_ASSERT(g_hash_table_lookup(remote_op_list, op->id) != NULL); op->state = st_query; op->action = crm_element_value_copy(dev, F_STONITH_ACTION); op->originator = crm_element_value_copy(dev, F_STONITH_ORIGIN); if(op->originator == NULL) { /* Local or relayed request */ op->originator = strdup(stonith_our_uname); } if(client) { op->client_id = strdup(client); } op->client_name = crm_element_value_copy(request, F_STONITH_CLIENTNAME); op->target = crm_element_value_copy(dev, F_STONITH_TARGET); op->request = copy_xml(request); /* TODO: Figure out how to avoid this */ crm_element_value_int(request, F_STONITH_CALLOPTS, (int*)&(op->call_options)); crm_trace("%s new stonith op: %s - %s of %s for %s", (peer && dev)?"Recorded":"Generated", op->id, op->action, op->target, op->client_name); if(op->call_options & st_opt_cs_nodeid) { int nodeid = crm_atoi(op->target, NULL); crm_node_t *node = crm_get_peer(nodeid, NULL); /* Ensure the conversion only happens once */ op->call_options &= ~st_opt_cs_nodeid; if(node && node->uname) { free(op->target); op->target = strdup(node->uname); } else { crm_warn("Could not expand nodeid '%s' into a host name (%p)", op->target, node); } } /* check to see if this is a duplicate operation of another in-flight operation */ merge_duplicates(op); return op; } remote_fencing_op_t *initiate_remote_stonith_op(stonith_client_t *client, xmlNode *request, gboolean manual_ack) { xmlNode *query = NULL; const char *client_id = NULL; remote_fencing_op_t *op = NULL; if(client) { client_id = client->id; } else { client_id = crm_element_value(request, F_STONITH_CLIENTID); } CRM_LOG_ASSERT(client_id != NULL); op = create_remote_stonith_op(client_id, request, FALSE); op->owner = TRUE; CRM_CHECK(op->action, return NULL); if(stonith_topology_next(op) != pcmk_ok) { op->state = st_failed; } switch(op->state) { case st_failed: crm_warn("Initiation of remote operation %s for %s: failed (%s)", op->action, op->target, op->id); remote_op_done(op, NULL, -EINVAL, FALSE); return op; case st_duplicate: crm_info("Initiating remote operation %s for %s: %s (duplicate)", op->action, op->target, op->id); return op; default: crm_notice("Initiating remote operation %s for %s: %s (%d)", op->action, op->target, op->id, op->state); } query = stonith_create_op(0, op->id, STONITH_OP_QUERY, NULL, 0); if(!manual_ack) { - op->query_timer = g_timeout_add(100*op->base_timeout, remote_op_query_timeout, op); + int query_timeout = op->base_timeout * TIMEOUT_MULTIPLY_FACTOR; + op->query_timer = g_timeout_add((1000 * query_timeout), remote_op_query_timeout, op); } else { crm_xml_add(query, F_STONITH_DEVICE, "manual_ack"); } crm_xml_add(query, F_STONITH_REMOTE_OP_ID, op->id); crm_xml_add(query, F_STONITH_TARGET, op->target); crm_xml_add(query, F_STONITH_ACTION, op->action); crm_xml_add(query, F_STONITH_ORIGIN, op->originator); crm_xml_add(query, F_STONITH_CLIENTID, op->client_id); crm_xml_add(query, F_STONITH_CLIENTNAME, op->client_name); crm_xml_add_int(query, F_STONITH_TIMEOUT, 
op->base_timeout); send_cluster_message(NULL, crm_msg_stonith_ng, query, FALSE); free_xml(query); return op; } static gint sort_strings(gconstpointer a, gconstpointer b) { return strcmp(a, b); } enum find_best_peer_options { /*! Skip checking the target peer for capable fencing devices */ FIND_PEER_SKIP_TARGET = 0x0001, /*! Only check the target peer for capable fencing devices */ FIND_PEER_TARGET_ONLY = 0x0002, /*! Skip peers and devices that are not verified */ FIND_PEER_VERIFIED_ONLY = 0x0004, }; static st_query_result_t * find_best_peer(const char *device, remote_fencing_op_t *op, enum find_best_peer_options options) { GListPtr iter = NULL; gboolean verified_devices_only = (options & FIND_PEER_VERIFIED_ONLY) ? TRUE : FALSE; if (!device && is_set(op->call_options, st_opt_topology)) { return NULL; } for(iter = op->query_results; iter != NULL; iter = iter->next) { st_query_result_t *peer = iter->data; if ((options & FIND_PEER_SKIP_TARGET) && safe_str_eq(peer->host, op->target)) { continue; } if ((options & FIND_PEER_TARGET_ONLY) && safe_str_neq(peer->host, op->target)) { continue; } if(is_set(op->call_options, st_opt_topology)) { /* Do they have the next device of the current fencing level? */ GListPtr match = NULL; if (verified_devices_only && !g_hash_table_lookup(peer->verified_devices, device)) { continue; } match = g_list_find_custom(peer->device_list, device, sort_strings); if(match) { crm_trace("Removing %s from %s (%d remaining)", (char*)match->data, peer->host, g_list_length(peer->device_list)); peer->device_list = g_list_remove(peer->device_list, match->data); return peer; } } else if(peer->devices > 0) { if (verified_devices_only && !g_hash_table_size(peer->verified_devices)) { continue; } /* No topology: Use the current best peer */ crm_trace("Simple fencing"); return peer; } } return NULL; } static st_query_result_t *stonith_choose_peer(remote_fencing_op_t *op) { st_query_result_t *peer = NULL; const char *device = NULL; do { if(op->devices) { device = op->devices->data; crm_trace("Checking for someone to fence %s with %s", op->target, (char*)op->devices->data); } else { crm_trace("Checking for someone to fence %s", op->target); } if ((peer = find_best_peer(device, op, FIND_PEER_SKIP_TARGET | FIND_PEER_VERIFIED_ONLY))) { return peer; } else if ((peer = find_best_peer(device, op, FIND_PEER_SKIP_TARGET))) { return peer; } else if ((peer = find_best_peer(device, op, FIND_PEER_TARGET_ONLY))) { return peer; } /* Try the next fencing level if there is one */ } while(is_set(op->call_options, st_opt_topology) && stonith_topology_next(op) == pcmk_ok); if(op->devices) { crm_debug("Couldn't find anyone to fence %s with %s", op->target, (char*)op->devices->data); } else { crm_debug("Couldn't find anyone to fence %s", op->target); } return NULL; } static int get_device_timeout(st_query_result_t *peer, const char *device, int default_timeout) { gpointer res; if (!peer || !device) { return default_timeout; } res = g_hash_table_lookup(peer->custom_action_timeouts, device); return res ? GPOINTER_TO_INT(res) : default_timeout; } static int get_op_total_timeout(remote_fencing_op_t *op, st_query_result_t *chosen_peer, int default_timeout) { stonith_topology_t *tp = g_hash_table_lookup(topology, op->target); int total_timeout = 0; if (is_set(op->call_options, st_opt_topology) && tp) { int i; GListPtr device_list = NULL; GListPtr iter = NULL; /* Yep, this looks scary, nested loops all over the place. * Here is what is going on. * Loop1: Iterate through fencing levels. 
* Loop2: If a fencing level has devices, loop through each device * Loop3: For each device in a fencing level, see what peer owns it * and what that peer has reported the timeout is for the device. */ for (i = 0; i < ST_LEVEL_MAX; i++) { if (!tp->levels[i]) { continue; } for (device_list = tp->levels[i]; device_list; device_list = device_list->next) { for(iter = op->query_results; iter != NULL; iter = iter->next) { st_query_result_t *peer = iter->data; if (g_list_find_custom(peer->device_list, device_list->data, sort_strings)) { total_timeout += get_device_timeout(peer, device_list->data, default_timeout); break; } } /* End Loop3: match device with peer that owns device, find device's timeout period */ } /* End Loop2: iterate through devices at a specific level */ } /* End Loop1: iterate through fencing levels */ } else if (chosen_peer) { GListPtr cur = NULL; for (cur = chosen_peer->device_list; cur; cur = cur->next) { total_timeout += get_device_timeout(chosen_peer, cur->data, default_timeout); } } else { total_timeout = default_timeout; } return total_timeout ? total_timeout : default_timeout; } static void report_timeout_period(remote_fencing_op_t *op, int op_timeout) { GListPtr iter = NULL; xmlNode *update = NULL; const char *client_node = NULL; const char *client_id = NULL; const char *call_id = NULL; if (op->call_options & st_opt_sync_call) { /* There is no reason to report the timeout for a synchronous call. It * is impossible to use the reported timeout to do anything when the client * is blocking for the response. This update is only important for * async calls that require a callback to report the results. */ return; } else if (!op->request) { return; } crm_trace("Reporting timeout for %s.%.8s", op->client_name, op->id); client_node = crm_element_value(op->request, F_STONITH_CLIENTNODE); call_id = crm_element_value(op->request, F_STONITH_CALLID); client_id = crm_element_value(op->request, F_STONITH_CLIENTID); if (!client_node || !call_id || !client_id) { return; } if (safe_str_eq(client_node, stonith_our_uname)) { /* The client is connected to this node, send the update directly to them */ do_stonith_async_timeout_update(client_id, call_id, op_timeout); return; } /* The client is connected to another node, relay this update to them */ update = stonith_create_op(0, op->id, STONITH_OP_TIMEOUT_UPDATE, NULL, 0); crm_xml_add(update, F_STONITH_REMOTE_OP_ID, op->id); crm_xml_add(update, F_STONITH_CLIENTID, client_id); crm_xml_add(update, F_STONITH_CALLID, call_id); crm_xml_add_int(update, F_STONITH_TIMEOUT, op_timeout); send_cluster_message(crm_get_peer(0, client_node), crm_msg_stonith_ng, update, FALSE); free_xml(update); for(iter = op->duplicates; iter != NULL; iter = iter->next) { remote_fencing_op_t *dup = iter->data; crm_trace("Reporting timeout for duplicate %s.%.8s", dup->client_name, dup->id); report_timeout_period(dup, op_timeout); } } void call_remote_stonith(remote_fencing_op_t *op, st_query_result_t *peer) { const char *device = NULL; int timeout = op->base_timeout; if(peer == NULL && !is_set(op->call_options, st_opt_topology)) { peer = stonith_choose_peer(op); } if(!op->op_timer_total) { int t = get_op_total_timeout(op, peer, op->base_timeout); op->total_timeout = TIMEOUT_MULTIPLY_FACTOR * t; op->op_timer_total = g_timeout_add(1000 * op->total_timeout, remote_op_timeout, op); report_timeout_period(op, op->total_timeout); crm_info("Total remote op timeout set to %d for fencing of node %s for %s.%.8s", t, op->target, op->client_name, op->id); }
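/* Worked example for the timeout bookkeeping above (all numbers are
 * hypothetical): a topology with level 1 = { ipmi } and level 2 =
 * { psu1, psu2 }, a default_timeout of 20s and a peer-reported custom
 * timeout of 60s for ipmi gives t = 60 + 20 + 20 = 100s. That total is
 * then scaled by TIMEOUT_MULTIPLY_FACTOR before the one-shot
 * remote_op_timeout timer is armed, so slower devices get headroom. */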
if(is_set(op->call_options, st_opt_topology) && op->devices) { /* Ignore any preference, they might not have the device we need */ /* When using topology, the stonith_choose_peer function pops off * the peer from the op's query results. Make sure to calculate * the op_timeout before calling this function when topology is in use */ peer = stonith_choose_peer(op); device = op->devices->data; timeout = get_device_timeout(peer, device, op->base_timeout); } if(peer) { int t = TIMEOUT_MULTIPLY_FACTOR * get_device_timeout(peer, device, op->base_timeout); xmlNode *query = stonith_create_op(0, op->id, STONITH_OP_FENCE, NULL, 0); crm_xml_add(query, F_STONITH_REMOTE_OP_ID, op->id); crm_xml_add(query, F_STONITH_TARGET, op->target); crm_xml_add(query, F_STONITH_ACTION, op->action); crm_xml_add(query, F_STONITH_ORIGIN, op->originator); crm_xml_add(query, F_STONITH_CLIENTID, op->client_id); crm_xml_add(query, F_STONITH_CLIENTNAME, op->client_name); crm_xml_add_int(query, F_STONITH_TIMEOUT, timeout); if(device) { crm_info("Requesting that %s perform op %s %s with %s for %s (%ds)", peer->host, op->action, op->target, device, op->client_name, t); crm_xml_add(query, F_STONITH_DEVICE, device); crm_xml_add(query, F_STONITH_MODE, "slave"); } else { crm_info("Requesting that %s perform op %s %s for %s (%ds)", peer->host, op->action, op->target, op->client_name, t); crm_xml_add(query, F_STONITH_MODE, "smart"); } op->state = st_exec; if(op->op_timer_one) { g_source_remove(op->op_timer_one); } op->op_timer_one = g_timeout_add((1000 * t), remote_op_timeout_one, op); send_cluster_message(crm_get_peer(0, peer->host), crm_msg_stonith_ng, query, FALSE); free_xml(query); return; } else if(!op->owner) { crm_err("The termination of %s for %s is not ours to control", op->target, op->client_name); } else if(!op->query_timer) { CRM_LOG_ASSERT(op->state < st_done); /* We've exhausted all available peers */ crm_info("No remaining peers capable of terminating %s for %s (state=%d)", op->target, op->client_name, op->state); remote_op_timeout(op); } else if(device) { crm_info("Waiting for additional peers capable of terminating %s with %s for %s.%.8s", op->target, device, op->client_name, op->id); } else { crm_info("Waiting for additional peers capable of terminating %s for %s.%.8s", op->target, op->client_name, op->id); } free_remote_query(peer); } static gint sort_peers(gconstpointer a, gconstpointer b) { const st_query_result_t *peer_a = a; const st_query_result_t *peer_b = b; if(peer_a->devices > peer_b->devices) { return -1; } else if(peer_a->devices < peer_b->devices) { return 1; } return 0; } /*! * \internal * \brief Determine if all the devices in the topology are found or not */ static gboolean all_topology_devices_found(remote_fencing_op_t *op) { GListPtr device = NULL; GListPtr iter = NULL; GListPtr match = NULL; stonith_topology_t *tp = NULL; gboolean skip_target = FALSE; int i; tp = g_hash_table_lookup(topology, op->target); if (!tp) { return FALSE; } if (safe_str_eq(op->action, "off") || safe_str_eq(op->action, "reboot")) { /* Don't count the devices on the target node if we are killing * the target node.
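* For example, if node1 is being rebooted and is also the only host
* that reported a working fencing device for itself, waiting on that
* device would be pointless; the target's own devices are therefore
* excluded from the completeness check below.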
*/ skip_target = TRUE; } for (i = 0; i < ST_LEVEL_MAX; i++) { for (device = tp->levels[i]; device; device = device->next) { match = NULL; for(iter = op->query_results; iter != NULL; iter = iter->next) { st_query_result_t *peer = iter->data; if (skip_target && safe_str_eq(peer->host, op->target)) { continue; } match = g_list_find_custom(peer->device_list, device->data, sort_strings); } if (!match) { return FALSE; } } } return TRUE; } int process_remote_stonith_query(xmlNode *msg) { int devices = 0; const char *id = NULL; const char *host = NULL; remote_fencing_op_t *op = NULL; st_query_result_t *result = NULL; xmlNode *dev = get_xpath_object("//@"F_STONITH_REMOTE_OP_ID, msg, LOG_ERR); xmlNode *child = NULL; CRM_CHECK(dev != NULL, return -EPROTO); id = crm_element_value(dev, F_STONITH_REMOTE_OP_ID); CRM_CHECK(id != NULL, return -EPROTO); dev = get_xpath_object("//@st-available-devices", msg, LOG_ERR); CRM_CHECK(dev != NULL, return -EPROTO); crm_element_value_int(dev, "st-available-devices", &devices); op = g_hash_table_lookup(remote_op_list, id); if(op == NULL) { crm_debug("Unknown or expired remote op: %s", id); return -EOPNOTSUPP; } op->replies++; host = crm_element_value(msg, F_ORIG); if(devices <= 0) { /* If we're doing 'known' then we might need to fire anyway */ crm_trace("Query result from %s (%d devices)", host, devices); return pcmk_ok; } else if(op->call_options & st_opt_allow_suicide) { crm_trace("Allowing %s to potentially fence itself", op->target); } else if(safe_str_eq(host, op->target)) { crm_info("Ignoring reply from %s, hosts are not permitted to commit suicide", op->target); return pcmk_ok; } crm_debug("Query result from %s (%d devices)", host, devices); result = calloc(1, sizeof(st_query_result_t)); result->host = strdup(host); result->devices = devices; result->custom_action_timeouts = g_hash_table_new_full( crm_str_hash, g_str_equal, free, NULL); result->verified_devices = g_hash_table_new_full( crm_str_hash, g_str_equal, free, NULL); for (child = __xml_first_child(dev); child != NULL; child = __xml_next(child)) { const char *device = ID(child); int action_timeout = 0; int verified = 0; if(device) { result->device_list = g_list_prepend(result->device_list, strdup(device)); crm_element_value_int(child, F_STONITH_ACTION_TIMEOUT, &action_timeout); crm_element_value_int(child, F_STONITH_DEVICE_VERIFIED, &verified); if (action_timeout) { crm_trace("Peer %s with device %s returned action timeout %d", result->host, device, action_timeout); g_hash_table_insert(result->custom_action_timeouts, strdup(device), GINT_TO_POINTER(action_timeout)); } if (verified) { crm_trace("Peer %s has confirmed a verified device %s", result->host, device); g_hash_table_insert(result->verified_devices, strdup(device), GINT_TO_POINTER(verified)); } } } CRM_CHECK(devices == g_list_length(result->device_list), crm_err("Mismatch: Query claimed to have %d devices but %d found", devices, g_list_length(result->device_list))); op->query_results = g_list_insert_sorted(op->query_results, result, sort_peers); /* All the query results are in for the topology, start the fencing ops. */ if(is_set(op->call_options, st_opt_topology)) { /* If we start the fencing before all the topology results are in, * it is possible fencing levels will be skipped because of the missing * query results.
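* E.g. with level 1 = { ipmi reported by node2 } and level 2 = { psu
* reported by node3 }, acting on node3's early reply alone would make
* level 1 look empty, and the operation could drop straight to level 2
* even though the preferred ipmi device was about to be reported.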
*/ if (op->state == st_query && all_topology_devices_found(op)) { call_remote_stonith(op, result); } /* We have a result for a non-topology fencing op, start fencing */ } else if(op->state == st_query) { call_remote_stonith(op, result); } else if(op->state == st_done) { crm_info("Discarding query result from %s (%d devices): Operation is in state %d", result->host, result->devices, op->state); } return pcmk_ok; } int process_remote_stonith_exec(xmlNode *msg) { int rc = 0; const char *id = NULL; const char *device = NULL; remote_fencing_op_t *op = NULL; xmlNode *dev = get_xpath_object("//@"F_STONITH_REMOTE_OP_ID, msg, LOG_ERR); CRM_CHECK(dev != NULL, return -EPROTO); id = crm_element_value(dev, F_STONITH_REMOTE_OP_ID); CRM_CHECK(id != NULL, return -EPROTO); dev = get_xpath_object("//@"F_STONITH_RC, msg, LOG_ERR); CRM_CHECK(dev != NULL, return -EPROTO); crm_element_value_int(dev, F_STONITH_RC, &rc); device = crm_element_value(dev, F_STONITH_DEVICE); if(remote_op_list) { op = g_hash_table_lookup(remote_op_list, id); } if(op == NULL && rc == pcmk_ok) { /* Record successful fencing operations */ const char *client_id = crm_element_value(msg, F_STONITH_CLIENTID); op = create_remote_stonith_op(client_id, msg, TRUE); } if(op == NULL) { /* Could be for an event that began before we started */ /* TODO: Record the op for later querying */ crm_info("Unknown or expired remote op: %s", id); return -EOPNOTSUPP; } if (op->devices && device && safe_str_neq(op->devices->data, device)) { crm_err("Received outdated reply for device %s to %s node %s. Operation already timed out at remote level.", device, op->action, op->target); return rc; } if(safe_str_eq(crm_element_value(msg, F_SUBTYPE), "broadcast")) { crm_debug("Marking call to %s for %s on behalf of %s@%s.%.8s: %s (%d)", op->action, op->target, op->client_name, op->originator, op->id, rc == pcmk_ok?"passed":"failed", rc); if(rc == pcmk_ok) { op->state = st_done; } else { op->state = st_failed; } remote_op_done(op, msg, rc, FALSE); return pcmk_ok; + } else if(safe_str_neq(op->originator, stonith_our_uname)) { + /* If this isn't a remote level broadcast, and we are not the + * originator of the operation, we should not be receiving this msg. */ + crm_err("%s received non-broadcast fencing result for operation it does not own (device %s targeting %s)", + stonith_our_uname, device, op->target); + return rc; } - + if(is_set(op->call_options, st_opt_topology)) { const char *device = crm_element_value(msg, F_STONITH_DEVICE); crm_notice("Call to %s for %s on behalf of %s@%s: %s (%d)", device, op->target, op->client_name, op->originator, rc == pcmk_ok?"passed":"failed", rc); - if(safe_str_eq(op->originator, stonith_our_uname)) { - if(op->state == st_done) { - remote_op_done(op, msg, rc, FALSE); - return rc; + /* We own the op, and it is complete. Broadcast the result to all nodes + * and notify our local clients. */ + if(op->state == st_done) { + remote_op_done(op, msg, rc, FALSE); + return rc; - } else if(rc == pcmk_ok && op->devices) { + } + + /* An operation completed successfully but has not yet been marked as done. + * Continue the topology if more devices exist at the current level, otherwise + * mark as done. */ + if(rc == pcmk_ok) { + if (op->devices) { /* Success, are there any more?
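* E.g. if the current level is { psu1, psu2 } and psu1 just
* succeeded, op->devices advances to psu2 below; only once the
* list is exhausted is the level, and the operation, complete.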
*/ op->devices = op->devices->next; } - + /* If no more devices at this fencing level, we are done, + * else we need to continue with executing the next device in the list */ if(op->devices == NULL) { crm_trace("Marking complex fencing op for %s as complete", op->target); - if(rc == pcmk_ok) { - op->state = st_done; - } else { - op->state = st_failed; - } + op->state = st_done; remote_op_done(op, msg, rc, FALSE); return rc; } - } else { - op->state = st_done; - remote_op_done(op, msg, rc, FALSE); - return rc; + /* This device failed, time to try another topology level. If no other + * levels are available, mark this operation as failed and report results. */ + if (stonith_topology_next(op) != pcmk_ok) { + op->state = st_failed; + remote_op_done(op, msg, rc, FALSE); + } } - } else if(rc == pcmk_ok && op->devices == NULL) { crm_trace("All done for %s", op->target); op->state = st_done; remote_op_done(op, msg, rc, FALSE); return rc; } /* Retry on failure or execute the rest of the topology */ crm_trace("Next for %s on behalf of %s@%s (rc was %d)", op->target, op->originator, op->client_name, rc); call_remote_stonith(op, NULL); return rc; } int stonith_fence_history(xmlNode *msg, xmlNode **output) { int rc = 0; const char *target = NULL; xmlNode *dev = get_xpath_object("//@"F_STONITH_TARGET, msg, LOG_TRACE); if(dev) { int options = 0; target = crm_element_value(dev, F_STONITH_TARGET); crm_element_value_int(msg, F_STONITH_CALLOPTS, &options); if(target && (options & st_opt_cs_nodeid)) { int nodeid = crm_atoi(target, NULL); crm_node_t *node = crm_get_peer(nodeid, NULL); if(node) { target = node->uname; } } } *output = create_xml_node(NULL, F_STONITH_HISTORY_LIST); if (remote_op_list) { GHashTableIter iter; remote_fencing_op_t *op = NULL; g_hash_table_iter_init(&iter, remote_op_list); while(g_hash_table_iter_next(&iter, NULL, (void**)&op)) { xmlNode *entry = NULL; if (target && strcmp(op->target, target) != 0) { continue; } rc = 0; entry = create_xml_node(*output, STONITH_OP_EXEC); crm_xml_add(entry, F_STONITH_TARGET, op->target); crm_xml_add(entry, F_STONITH_ACTION, op->action); crm_xml_add(entry, F_STONITH_ORIGIN, op->originator); crm_xml_add(entry, F_STONITH_DELEGATE, op->delegate); crm_xml_add_int(entry, F_STONITH_DATE, op->completed); crm_xml_add_int(entry, F_STONITH_STATE, op->state); } } return rc; } gboolean stonith_check_fence_tolerance(int tolerance, const char *target, const char *action) { GHashTableIter iter; time_t now = time(NULL); remote_fencing_op_t *rop = NULL; crm_trace("tolerance=%d, remote_op_list=%p", tolerance, remote_op_list); if (tolerance <= 0 || !remote_op_list || target == NULL || action == NULL) { return FALSE; } g_hash_table_iter_init(&iter, remote_op_list); while(g_hash_table_iter_next(&iter, NULL, (void**)&rop)) { if(strcmp(rop->target, target) != 0) { continue; } else if(rop->state != st_done) { continue; } else if(strcmp(rop->action, action) != 0) { continue; } else if((rop->completed + tolerance) < now) { continue; } crm_notice("Target %s was fenced (%s) less than %ds ago by %s on behalf of %s", target, action, tolerance, rop->delegate, rop->originator); return TRUE; } return FALSE; } diff --git a/include/crm/cib.h b/include/crm/cib.h index 5e1c30ab45..978bfc7703 100644 --- a/include/crm/cib.h +++ b/include/crm/cib.h @@ -1,171 +1,178 @@ /* * Copyright (C) 2004 Andrew Beekhof * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public * License as published by the Free Software Foundation; either *
version 2 of the License, or (at your option) any later version. * * This software is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * General Public License for more details. * * You should have received a copy of the GNU General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ + +/** + * \file + * \brief Cluster Configuration + * \ingroup cib + */ + #ifndef CIB__H # define CIB__H # include # include # define CIB_FEATURE_SET "2.0" /* use compare_version() for doing comparisons */ enum cib_variant { cib_undefined, cib_native, cib_file, cib_remote, cib_database, }; enum cib_state { cib_connected_command, cib_connected_query, cib_disconnected }; enum cib_conn_type { cib_command, cib_query, cib_no_connection, cib_command_nonblocking, }; /* *INDENT-OFF* */ enum cib_call_options { cib_none = 0x00000000, cib_verbose = 0x00000001, cib_xpath = 0x00000002, cib_multiple = 0x00000004, cib_can_create = 0x00000008, cib_discard_reply = 0x00000010, cib_no_children = 0x00000020, cib_scope_local = 0x00000100, cib_dryrun = 0x00000200, cib_sync_call = 0x00001000, cib_inhibit_notify = 0x00010000, cib_quorum_override = 0x00100000, cib_inhibit_bcast = 0x01000000, cib_force_diff = 0x10000000 }; #define cib_default_options = cib_none #define T_CIB_DIFF_NOTIFY "cib_diff_notify" /* *INDENT-ON* */ typedef struct cib_s cib_t; typedef struct cib_api_operations_s { int (*signon) (cib_t * cib, const char *name, enum cib_conn_type type); int (*signon_raw) (cib_t * cib, const char *name, enum cib_conn_type type, int *event_fd); int (*signoff) (cib_t * cib); int (*free) (cib_t * cib); int (*set_op_callback) (cib_t * cib, void (*callback) (const xmlNode * msg, int callid, int rc, xmlNode * output)); int (*add_notify_callback) (cib_t * cib, const char *event, void (*callback) (const char *event, xmlNode * msg)); int (*del_notify_callback) (cib_t * cib, const char *event, void (*callback) (const char *event, xmlNode * msg)); int (*set_connection_dnotify) (cib_t * cib, void (*dnotify) (gpointer user_data)); int (*inputfd) (cib_t * cib); int (*noop) (cib_t * cib, int call_options); int (*ping) (cib_t * cib, xmlNode ** output_data, int call_options); int (*query) (cib_t * cib, const char *section, xmlNode ** output_data, int call_options); int (*query_from) (cib_t * cib, const char *host, const char *section, xmlNode ** output_data, int call_options); int (*is_master) (cib_t * cib); int (*set_master) (cib_t * cib, int call_options); int (*set_slave) (cib_t * cib, int call_options); int (*set_slave_all) (cib_t * cib, int call_options); int (*sync) (cib_t * cib, const char *section, int call_options); int (*sync_from) (cib_t * cib, const char *host, const char *section, int call_options); int (*upgrade) (cib_t * cib, int call_options); int (*bump_epoch) (cib_t * cib, int call_options); int (*create) (cib_t * cib, const char *section, xmlNode * data, int call_options); int (*modify) (cib_t * cib, const char *section, xmlNode * data, int call_options); int (*update) (cib_t * cib, const char *section, xmlNode * data, int call_options); int (*replace) (cib_t * cib, const char *section, xmlNode * data, int call_options); int (*delete) (cib_t * cib, const char *section, xmlNode * data, int call_options); int (*erase) (cib_t * cib, xmlNode ** output_data, int call_options); int (*delete_absolute) (cib_t 
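/* A minimal sketch of driving this operations table (illustrative only;
 * error handling is omitted and the client name "example" is made up):
 *
 *   cib_t *cib = cib_new();
 *   cib->cmds->signon(cib, "example", cib_query);
 *   xmlNode *out = NULL;
 *   cib->cmds->query(cib, NULL, &out, cib_sync_call);
 *   free_xml(out);
 *   cib->cmds->signoff(cib);
 *   cib_delete(cib);
 */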
* cib, const char *section, xmlNode * data, int call_options); int (*quit) (cib_t * cib, int call_options); int (*register_notification) (cib_t * cib, const char *callback, int enabled); gboolean(*register_callback) (cib_t * cib, int call_id, int timeout, gboolean only_success, void *user_data, const char *callback_name, void (*callback) (xmlNode *, int, int, xmlNode *, void *)); } cib_api_operations_t; struct cib_s { enum cib_state state; enum cib_conn_type type; enum cib_variant variant; int call_id; int call_timeout; void *variant_opaque; void *delegate_fn; GList *notify_list; void (*op_callback) (const xmlNode * msg, int call_id, int rc, xmlNode * output); cib_api_operations_t *cmds; }; /* Core functions */ cib_t *cib_new(void); cib_t *cib_native_new(void); cib_t *cib_file_new(const char *filename); cib_t *cib_remote_new(const char *server, const char *user, const char *passwd, int port, gboolean encrypted); cib_t *cib_new_no_shadow(void); char *get_shadow_file(const char *name); cib_t *cib_shadow_new(const char *name); void cib_delete(cib_t * cib); void cib_dump_pending_callbacks(void); int num_cib_op_callbacks(void); void remove_cib_op_callback(int call_id, gboolean all_callbacks); # define add_cib_op_callback(cib, id, flag, data, fn) cib->cmds->register_callback(cib, id, 120, flag, data, #fn, fn) # include # define CIB_LIBRARY "libcib.so.2" #endif diff --git a/include/crm/common/ipc.h b/include/crm/common/ipc.h index e08b8e64cc..46bf50a47e 100644 --- a/include/crm/common/ipc.h +++ b/include/crm/common/ipc.h @@ -1,82 +1,88 @@ /* * Copyright (C) 2004 Andrew Beekhof * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public * License as published by the Free Software Foundation; either * version 2 of the License, or (at your option) any later version. * * This software is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * General Public License for more details. 
* * You should have received a copy of the GNU General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ #ifndef CRM_COMMON_IPC__H # define CRM_COMMON_IPC__H +/** + * \file + * \brief Wrappers for and extensions to libqb IPC + * \ingroup core + */ + # include /* clplumbing based IPC */ # define create_reply(request, xml_response_data) create_reply_adv(request, xml_response_data, __FUNCTION__); xmlNode *create_reply_adv(xmlNode * request, xmlNode * xml_response_data, const char *origin); # define create_request(task, xml_data, host_to, sys_to, sys_from, uuid_from) create_request_adv(task, xml_data, host_to, sys_to, sys_from, uuid_from, __FUNCTION__) xmlNode *create_request_adv(const char *task, xmlNode * xml_data, const char *host_to, const char *sys_to, const char *sys_from, const char *uuid_from, const char *origin); /* Libqb based IPC */ #include enum crm_ipc_server_flags { crm_ipc_server_none = 0x0000, crm_ipc_server_event = 0x0001, /* Send an Event instead of a Response */ crm_ipc_server_info = 0x0010, /* Log failures as LOG_INFO */ crm_ipc_server_error = 0x0020, /* Log failures as LOG_ERR */ }; enum crm_ipc_flags { crm_ipc_client_none = 0x0000, crm_ipc_client_response = 0x0001, /* A Response is expected in reply */ }; void crm_ipcs_send_ack(qb_ipcs_connection_t *c, uint32_t request, const char *tag, const char *function, int line); ssize_t crm_ipcs_send(qb_ipcs_connection_t *c, uint32_t request, xmlNode *message, enum crm_ipc_server_flags flags); xmlNode *crm_ipcs_recv(qb_ipcs_connection_t *c, void *data, size_t size, uint32_t *id, uint32_t *flags); int crm_ipcs_client_pid(qb_ipcs_connection_t *c); #include typedef struct crm_ipc_s crm_ipc_t; crm_ipc_t *crm_ipc_new(const char *name, size_t max_size); bool crm_ipc_connect(crm_ipc_t *client); void crm_ipc_close(crm_ipc_t *client); void crm_ipc_destroy(crm_ipc_t *client); int crm_ipc_send(crm_ipc_t *client, xmlNode *message, enum crm_ipc_flags flags, int32_t ms_timeout, xmlNode **reply); int crm_ipc_get_fd(crm_ipc_t *client); bool crm_ipc_connected(crm_ipc_t *client); int crm_ipc_ready(crm_ipc_t *client); long crm_ipc_read(crm_ipc_t *client); const char *crm_ipc_buffer(crm_ipc_t *client); const char *crm_ipc_name(crm_ipc_t *client); /* Utils */ xmlNode *create_hello_message(const char *uuid, const char *client_name, const char *major_version, const char *minor_version); #endif diff --git a/include/crm/common/iso8601.h b/include/crm/common/iso8601.h index ac77700aee..22cc44c9f1 100644 --- a/include/crm/common/iso8601.h +++ b/include/crm/common/iso8601.h @@ -1,114 +1,120 @@ /* * Copyright (C) 2005 Andrew Beekhof * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public * License as published by the Free Software Foundation; either * version 2 of the License, or (at your option) any later version. * * This software is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * General Public License for more details. 
* * You should have received a copy of the GNU General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ +/** + * \file + * \brief ISO_8601 Date handling + * \ingroup date + */ + /* * http://en.wikipedia.org/wiki/ISO_8601 * */ #ifndef CRM_COMMON_ISO8601 # define CRM_COMMON_ISO8601 # include # include # include typedef struct crm_time_s crm_time_t; typedef struct crm_time_period_s { crm_time_t *start; crm_time_t *end; crm_time_t *diff; } crm_time_period_t; /* Creates a new date/time object conforming to iso8601: * http://en.wikipedia.org/wiki/ISO_8601 * * Eg. * Ordinal: 2010-01 12:00:00 +10:00 * Gregorian: 2010-01-01 12:00:00 +10:00 * ISO Week: 2010-W53-6 12:00:00 +10:00 * * Notes: * Only one of date, time is required * If date or timezone is unspecified, they default to the current one * Supplying NULL results in the current date/time * Dashes may be omitted from dates * Colons may be omitted from times and timezones * A timezone of 'Z' denotes UTC time */ crm_time_t *crm_time_new(const char *string); void crm_time_free(crm_time_t * dt); char *crm_time_as_string(crm_time_t * dt, int flags); #define crm_time_log(level, prefix, dt, flags) crm_time_log_alias(level, __FILE__, __FUNCTION__, __LINE__, prefix, dt, flags) void crm_time_log_alias(int log_level, const char *file, const char *function, int line, const char *prefix, crm_time_t * date_time, int flags); # define crm_time_log_date 0x001 # define crm_time_log_timeofday 0x002 # define crm_time_log_with_timezone 0x004 # define crm_time_ordinal 0x010 # define crm_time_weeks 0x020 # define crm_time_seconds 0x100 # define crm_time_epoch 0x200 crm_time_t *crm_time_parse_duration(const char *duration_str); crm_time_period_t *crm_time_parse_period(const char *period_str); int crm_time_compare(crm_time_t *dt, crm_time_t * rhs); int crm_time_get_timeofday(crm_time_t *dt, uint32_t *h, uint32_t *m, uint32_t *s); int crm_time_get_timezone(crm_time_t *dt, uint32_t *h, uint32_t *m); int crm_time_get_gregorian(crm_time_t *dt, uint32_t *y, uint32_t *m, uint32_t *d); int crm_time_get_ordinal(crm_time_t *dt, uint32_t *y, uint32_t *d); int crm_time_get_isoweek(crm_time_t *dt, uint32_t *y, uint32_t *w, uint32_t *d); /* Time in seconds since 0000-01-01 00:00:00Z */ unsigned long long int crm_time_get_seconds(crm_time_t * dt); /* Time in seconds since 1970-01-01 00:00:00Z */ unsigned long long int crm_time_get_seconds_since_epoch(crm_time_t * dt); void crm_time_set(crm_time_t * target, crm_time_t * source); void crm_time_set_timet(crm_time_t * target, time_t * source); /* Returns a new time object */ crm_time_t *crm_time_add(crm_time_t * dt, crm_time_t * value); crm_time_t *crm_time_subtract(crm_time_t * dt, crm_time_t * value); /* All crm_time_add_...
functions support negative values */ void crm_time_add_seconds(crm_time_t *dt, int value); void crm_time_add_minutes(crm_time_t *dt, int value); void crm_time_add_hours(crm_time_t *dt, int value); void crm_time_add_days(crm_time_t *dt, int value); void crm_time_add_weekdays(crm_time_t *dt, int value); void crm_time_add_weeks(crm_time_t *dt, int value); void crm_time_add_months(crm_time_t *dt, int value); void crm_time_add_years(crm_time_t *dt, int value); void crm_time_add_ordinalyears(crm_time_t *dt, int value); void crm_time_add_weekyears(crm_time_t *dt, int value); /* Useful helper functions */ int crm_time_january1_weekday(int year); int crm_time_weeks_in_year(int year); int crm_time_days_in_month(int month, int year); bool crm_time_leapyear(int year); bool crm_time_check(crm_time_t * dt); #endif diff --git a/include/crm/common/logging.h b/include/crm/common/logging.h index 40b813fe64..d91d8e77a0 100644 --- a/include/crm/common/logging.h +++ b/include/crm/common/logging.h @@ -1,161 +1,168 @@ /* * Copyright (C) 2004 Andrew Beekhof * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public * License as published by the Free Software Foundation; either * version 2 of the License, or (at your option) any later version. * * This software is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * General Public License for more details. * * You should have received a copy of the GNU General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ + +/** + * \file + * \brief Wrappers for and extensions to libqb logging + * \ingroup core + */ + #ifndef CRM_LOGGING__H # define CRM_LOGGING__H # include # ifndef LOG_TRACE # define LOG_TRACE LOG_DEBUG+1 # endif # define LOG_DEBUG_2 LOG_TRACE # define LOG_DEBUG_3 LOG_TRACE # define LOG_DEBUG_4 LOG_TRACE # define LOG_DEBUG_5 LOG_TRACE # define LOG_DEBUG_6 LOG_TRACE extern unsigned int crm_log_level; extern gboolean crm_config_error; extern gboolean crm_config_warning; void crm_enable_blackbox(int nsig); void crm_enable_blackbox_tracing(int nsig); void crm_write_blackbox(int nsig, struct qb_log_callsite *callsite); void crm_update_callsites(void); void crm_log_deinit(void); gboolean crm_log_cli_init(const char *entity); gboolean crm_log_init(const char *entity, int level, gboolean daemon, gboolean to_stderr, int argc, char **argv, gboolean quiet); void crm_log_args(int argc, char **argv); gboolean crm_add_logfile(const char *filename); void crm_bump_log_level(int argc, char **argv); void crm_enable_stderr(int enable); gboolean crm_is_callsite_active(struct qb_log_callsite *cs, int level, int tags); int log_data_element(int log_level, const char *file, const char *function, int line, const char *prefix, xmlNode * data, int depth, gboolean formatted); /* returns the old value */ unsigned int set_crm_log_level(unsigned int level); unsigned int get_crm_log_level(void); /* * Throughout the macros below, note the leading, pre-comma, space in the * various ' , ##args' occurrences to aid portability across versions of 'gcc'. * http://gcc.gnu.org/onlinedocs/cpp/Variadic-Macros.html#Variadic-Macros */ # define CRM_TRACE_INIT_DATA(name) QB_LOG_INIT_DATA(name) # define do_crm_log(level, fmt, args...)
qb_log_from_external_source( __func__, __FILE__, fmt, level, __LINE__, 0, ##args) /* level /MUST/ be a constant or compilation will fail */ # define do_crm_log_unlikely(level, fmt, args...) do { \ static struct qb_log_callsite *trace_cs = NULL; \ if(trace_cs == NULL) { \ trace_cs = qb_log_callsite_get(__func__, __FILE__, fmt, level, __LINE__, 0); \ } \ if (crm_is_callsite_active(trace_cs, level, 0)) { \ qb_log_from_external_source( \ __func__, __FILE__, fmt, level, __LINE__, 0, ##args); \ } \ } while(0) # define CRM_LOG_ASSERT(expr) do { \ if(__unlikely((expr) == FALSE)) { \ static struct qb_log_callsite *core_cs = NULL; \ if(core_cs == NULL) { \ core_cs = qb_log_callsite_get(__func__, __FILE__, "log-assert", LOG_TRACE, __LINE__, 0); \ } \ crm_abort(__FILE__, __PRETTY_FUNCTION__, __LINE__, #expr, \ core_cs?core_cs->targets:FALSE, TRUE); \ } \ } while(0) # define CRM_CHECK(expr, failure_action) do { \ if(__unlikely((expr) == FALSE)) { \ static struct qb_log_callsite *core_cs = NULL; \ if(core_cs == NULL) { \ core_cs = qb_log_callsite_get(__func__, __FILE__, "check-assert", LOG_TRACE, __LINE__, 0); \ } \ crm_abort(__FILE__, __PRETTY_FUNCTION__, __LINE__, #expr, \ core_cs?core_cs->targets:FALSE, TRUE); \ failure_action; \ } \ } while(0) # define do_crm_log_xml(level, text, xml) do { \ static struct qb_log_callsite *xml_cs = NULL; \ if(xml_cs == NULL) { \ xml_cs = qb_log_callsite_get(__func__, __FILE__, "xml-blog", level, __LINE__, 0); \ } \ if (crm_is_callsite_active(xml_cs, level, 0)) { \ log_data_element(level, __FILE__, __PRETTY_FUNCTION__, __LINE__, text, xml, 0, TRUE); \ } \ } while(0) # define do_crm_log_alias(level, file, function, line, fmt, args...) do { \ qb_log_from_external_source(function, file, fmt, level, line, 0, ##args); \ } while(0) # define do_crm_log_always(level, fmt, args...) qb_log(level, "%s: " fmt, __PRETTY_FUNCTION__ , ##args) # define crm_perror(level, fmt, args...) do { \ const char *err = strerror(errno); \ fprintf(stderr, fmt ": %s (%d)\n", ##args, err, errno); \ do_crm_log(level, fmt ": %s (%d)", ##args, err, errno); \ } while(0) # define crm_log_tag(level, tag, fmt, args...) do { \ static struct qb_log_callsite *trace_tag_cs = NULL; \ int converted_tag = g_quark_try_string(tag); \ if(trace_tag_cs == NULL) { \ trace_tag_cs = qb_log_callsite_get(__func__, __FILE__, fmt, level, __LINE__, converted_tag); \ } \ if (crm_is_callsite_active(trace_tag_cs, level, converted_tag)) { \ qb_log_from_external_source( __func__, __FILE__, fmt, level, __LINE__, converted_tag, ##args); \ } \ } while(0) # define crm_crit(fmt, args...) qb_logt(LOG_CRIT, 0, fmt , ##args) # define crm_err(fmt, args...) qb_logt(LOG_ERR, 0, fmt , ##args) # define crm_warn(fmt, args...) qb_logt(LOG_WARNING, 0, fmt , ##args) # define crm_notice(fmt, args...) qb_logt(LOG_NOTICE, 0, fmt , ##args) # define crm_info(fmt, args...) qb_logt(LOG_INFO, 0, fmt , ##args) # define crm_debug(fmt, args...) do_crm_log_unlikely(LOG_DEBUG, fmt , ##args) # define crm_trace(fmt, args...) 
do_crm_log_unlikely(LOG_TRACE, fmt , ##args) # define crm_log_xml_crit(xml, text) do_crm_log_xml(LOG_CRIT, text, xml) # define crm_log_xml_err(xml, text) do_crm_log_xml(LOG_ERR, text, xml) # define crm_log_xml_warn(xml, text) do_crm_log_xml(LOG_WARNING, text, xml) # define crm_log_xml_notice(xml, text) do_crm_log_xml(LOG_NOTICE, text, xml) # define crm_log_xml_info(xml, text) do_crm_log_xml(LOG_INFO, text, xml) # define crm_log_xml_debug(xml, text) do_crm_log_xml(LOG_DEBUG, text, xml) # define crm_log_xml_trace(xml, text) do_crm_log_xml(LOG_TRACE, text, xml) # define crm_str(x) (const char*)(x?x:"") #endif diff --git a/include/crm/common/mainloop.h b/include/crm/common/mainloop.h index 5024b3ee70..959a73d61b 100644 --- a/include/crm/common/mainloop.h +++ b/include/crm/common/mainloop.h @@ -1,100 +1,106 @@ /* * Copyright (C) 2009 Andrew Beekhof * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public * License as published by the Free Software Foundation; either * version 2 of the License, or (at your option) any later version. * * This software is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * General Public License for more details. * * You should have received a copy of the GNU General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ #ifndef CRM_COMMON_MAINLOOP__H # define CRM_COMMON_MAINLOOP__H +/** + * \file + * \brief Wrappers for and extensions to glib mainloop + * \ingroup core + */ + # include typedef struct trigger_s crm_trigger_t; crm_trigger_t *mainloop_add_trigger(int priority, int(*dispatch) (gpointer user_data), gpointer userdata); void mainloop_set_trigger(crm_trigger_t * source); void mainloop_trigger_complete(crm_trigger_t *trig); gboolean mainloop_destroy_trigger(crm_trigger_t * source); gboolean crm_signal(int sig, void (*dispatch) (int sig)); gboolean mainloop_add_signal(int sig, void (*dispatch) (int sig)); gboolean mainloop_destroy_signal(int sig); #include struct ipc_client_callbacks { int (*dispatch)(const char *buffer, ssize_t length, gpointer userdata); void (*destroy) (gpointer); }; qb_ipcs_service_t *mainloop_add_ipc_server( const char *name, enum qb_ipc_type type, struct qb_ipcs_service_handlers *callbacks); void mainloop_del_ipc_server(qb_ipcs_service_t *server); typedef struct mainloop_io_s mainloop_io_t; mainloop_io_t *mainloop_add_ipc_client( const char *name, int priority, size_t max_size, void *userdata, struct ipc_client_callbacks *callbacks); void mainloop_del_ipc_client(mainloop_io_t *client); crm_ipc_t *mainloop_get_ipc_client(mainloop_io_t *client); struct mainloop_fd_callbacks { int (*dispatch)(gpointer userdata); void (*destroy)(gpointer userdata); }; mainloop_io_t *mainloop_add_fd( const char *name, int priority, int fd, void *userdata, struct mainloop_fd_callbacks *callbacks); void mainloop_del_fd(mainloop_io_t *client); typedef struct mainloop_child_s mainloop_child_t; /* * Create a new tracked process * To track a process group, use -pid */ void mainloop_add_child(pid_t pid, int timeout, const char *desc, void *userdata, void (*callback)(mainloop_child_t* p, int status, int signo, int exitcode)); void * mainloop_get_child_userdata(mainloop_child_t *child); int mainloop_get_child_timeout(mainloop_child_t *child); pid_t 
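/* A hypothetical sketch of the child-tracking API declared around here
 * (the 30s timeout, the "fence-agent" description and the callback body
 * are all made up):
 *
 *   static void child_done(mainloop_child_t *p, int status,
 *                          int signo, int exitcode)
 *   {
 *       crm_info("Child %d exited: rc=%d signal=%d",
 *                mainloop_get_child_pid(p), exitcode, signo);
 *   }
 *
 *   mainloop_add_child(child_pid, 30000, "fence-agent", NULL, child_done);
 */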
mainloop_get_child_pid(mainloop_child_t *child); void mainloop_clear_child_userdata(mainloop_child_t *child); #define G_PRIORITY_MEDIUM (G_PRIORITY_HIGH/2) #endif diff --git a/include/crm/common/util.h b/include/crm/common/util.h index b456c8f508..479c681b81 100644 --- a/include/crm/common/util.h +++ b/include/crm/common/util.h @@ -1,104 +1,110 @@ /* * Copyright (C) 2004 Andrew Beekhof * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public * License as published by the Free Software Foundation; either * version 2 of the License, or (at your option) any later version. * * This software is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * General Public License for more details. * * You should have received a copy of the GNU General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ #ifndef CRM_COMMON_UTIL__H # define CRM_COMMON_UTIL__H +/** + * \file + * \brief Utility functions + * \ingroup core + */ + # include # include # include # include # include # include # if SUPPORT_HEARTBEAT # include # else # define NORMALNODE "normal" # define ACTIVESTATUS "active"/* fully functional, and all links are up */ # define DEADSTATUS "dead" /* Status of non-working link or machine */ # define PINGSTATUS "ping" /* Status of a working ping node */ # define JOINSTATUS "join" /* Status when an api client joins */ # define LEAVESTATUS "leave" /* Status when an api client leaves */ # define ONLINESTATUS "online"/* Status of an online client */ # define OFFLINESTATUS "offline" /* Status of an offline client */ # endif char *crm_itoa(int an_int); gboolean crm_is_true(const char *s); int crm_str_to_boolean(const char *s, int *ret); int crm_parse_int(const char *text, const char *default_text); long long crm_get_msec(const char *input); unsigned long long crm_get_interval(const char *input); int char2score(const char *score); char *score2char(int score); int compare_version(const char *version1, const char *version2); gboolean parse_op_key(const char *key, char **rsc_id, char **op_type, int *interval); gboolean decode_transition_key( const char *key, char **uuid, int *action, int *transition_id, int *target_rc); gboolean decode_transition_magic( const char *magic, char **uuid, int *transition_id, int *action_id, int *op_status, int *op_rc, int *target_rc); # define safe_str_eq(a, b) crm_str_eq(a, b, FALSE) gboolean crm_str_eq(const char *a, const char *b, gboolean use_case); gboolean safe_str_neq(const char *a, const char *b); # define crm_atoi(text, default_text) crm_parse_int(text, default_text) /* coverity[+kill] */ void crm_abort(const char *file, const char *function, int line, const char *condition, gboolean do_core, gboolean do_fork); static inline gboolean is_not_set(long long word, long long bit) { return ((word & bit) == 0); } static inline gboolean is_set(long long word, long long bit) { return ((word & bit) == bit); } static inline gboolean is_set_any(long long word, long long bit) { return ((word & bit) != 0); } char *crm_meta_name(const char *field); const char *crm_meta_value(GHashTable * hash, const char *field); int rsc_op_expected_rc(lrmd_event_data_t *event); gboolean did_rsc_op_fail(lrmd_event_data_t *event, int target_rc); char *crm_md5sum(const char *buffer); char *crm_generate_uuid(void); int 
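/* The inline bit helpers above behave as follows (values illustrative):
 *
 *   is_set(0x05, 0x04)      -> TRUE   all bits of 0x04 are present
 *   is_set(0x05, 0x06)      -> FALSE  bit 0x02 is missing
 *   is_set_any(0x05, 0x06)  -> TRUE   bit 0x04 overlaps
 *   is_not_set(0x05, 0x02)  -> TRUE   no overlap at all
 */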
crm_user_lookup(const char *name, uid_t * uid, gid_t * gid); #endif diff --git a/include/crm/common/xml.h b/include/crm/common/xml.h index b1766b1c14..89e442e838 100644 --- a/include/crm/common/xml.h +++ b/include/crm/common/xml.h @@ -1,239 +1,245 @@ /* * Copyright (C) 2004 Andrew Beekhof * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public * License as published by the Free Software Foundation; either * version 2 of the License, or (at your option) any later version. * * This software is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * General Public License for more details. * * You should have received a copy of the GNU General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ #ifndef CRM_COMMON_XML__H # define CRM_COMMON_XML__H +/** + * \file + * \brief Wrappers for and extensions to libxml2 + * \ingroup core + */ + # include # include # include # include # include # include # include # include # include /* Encryption costs a LOT, don't do it unless we're hitting message limits * * For now, use 256k as the lower size, which means we can have 4 big data fields * before we hit heartbeat's message limit * * The previous limit was 10k; compressing 184 of 1071 messages accounted for 23% * of the total CPU used by the cib */ # define CRM_BZ2_BLOCKS 4 # define CRM_BZ2_WORK 20 # define CRM_BZ2_THRESHOLD 128 * 1024 # define XML_PARANOIA_CHECKS 0 gboolean add_message_xml(xmlNode * msg, const char *field, xmlNode * xml); xmlNode *get_message_xml(xmlNode * msg, const char *field); GHashTable *xml2list(xmlNode * parent); void hash2nvpair(gpointer key, gpointer value, gpointer user_data); void hash2field(gpointer key, gpointer value, gpointer user_data); void hash2metafield(gpointer key, gpointer value, gpointer user_data); void hash2smartfield(gpointer key, gpointer value, gpointer user_data); xmlDoc *getDocPtr(xmlNode * node); /* * Replacement function for xmlCopyPropList which at the very least, * doesn't work the way *I* would expect it to. * * Copy all the attributes/properties from src into target. * * Not recursive, does not return anything. * */ void copy_in_properties(xmlNode * target, xmlNode * src); void expand_plus_plus(xmlNode * target, const char *name, const char *value); void fix_plus_plus_recursive(xmlNode * target); /* * Create a node named "name" as a child of "parent" * If parent is NULL, creates an unconnected node. * * Returns the created node * */ xmlNode *create_xml_node(xmlNode * parent, const char *name); /* * Make a copy of name and value and use the copied memory to create * an attribute for node. * * If node, name or value are NULL, nothing is done. * * If name or value are an empty string, nothing is done. * * Returns the added attribute value on success, NULL on failure.
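* E.g. crm_xml_add(node, "id", "rsc1") attaches id="rsc1" to node;
* the "rsc1" value here is purely illustrative.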
* */ const char *crm_xml_add(xmlNode * node, const char *name, const char *value); const char *crm_xml_replace(xmlNode * node, const char *name, const char *value); const char *crm_xml_add_int(xmlNode * node, const char *name, int value); /* * Unlink the node and set its doc pointer to NULL so free_xml() * will act appropriately */ void unlink_xml_node(xmlNode * node); /* * */ void purge_diff_markers(xmlNode * a_node); /* * Returns a deep copy of src_node * */ xmlNode *copy_xml(xmlNode * src_node); /* * Add a copy of xml_node to new_parent */ xmlNode *add_node_copy(xmlNode * new_parent, xmlNode * xml_node); int add_node_nocopy(xmlNode * parent, const char *name, xmlNode * child); /* * XML I/O Functions * * Whitespace between tags is discarded. */ xmlNode *filename2xml(const char *filename); xmlNode *stdin2xml(void); xmlNode *string2xml(const char *input); int write_xml_file(xmlNode * xml_node, const char *filename, gboolean compress); char *dump_xml_formatted(xmlNode * msg); char *dump_xml_unformatted(xmlNode * msg); /* * Diff related Functions */ xmlNode *diff_xml_object(xmlNode * left, xmlNode * right, gboolean suppress); xmlNode *subtract_xml_object(xmlNode * parent, xmlNode * left, xmlNode * right, gboolean full, const char *marker); gboolean can_prune_leaf(xmlNode * xml_node); void print_xml_diff(FILE * where, xmlNode * diff); void log_xml_diff(unsigned int log_level, xmlNode * diff, const char *function); gboolean apply_xml_diff(xmlNode * old, xmlNode * diff, xmlNode ** new); /* * Searching & Modifying */ xmlNode *find_xml_node(xmlNode * cib, const char *node_path, gboolean must_find); xmlNode *find_entity(xmlNode * parent, const char *node_name, const char *id); void xml_remove_prop(xmlNode * obj, const char *name); gboolean replace_xml_child(xmlNode * parent, xmlNode * child, xmlNode * update, gboolean delete_only); gboolean update_xml_child(xmlNode * child, xmlNode * to_update); int find_xml_children(xmlNode ** children, xmlNode * root, const char *tag, const char *field, const char *value, gboolean search_matches); int crm_element_value_int(xmlNode * data, const char *name, int *dest); char *crm_element_value_copy(xmlNode * data, const char *name); int crm_element_value_const_int(const xmlNode * data, const char *name, int *dest); const char *crm_element_value_const(const xmlNode * data, const char *name); xmlNode *get_xpath_object(const char *xpath, xmlNode * xml_obj, int error_level); xmlNode *get_xpath_object_relative(const char *xpath, xmlNode * xml_obj, int error_level); # define crm_element_name(xml) (xml)?(const char *)(xml)->name:NULL const char *crm_element_value(xmlNode * data, const char *name); void xml_validate(const xmlNode * root); gboolean xml_has_children(const xmlNode * root); /* For ABI compatibility with version < 1.1.4 */ char *calculate_xml_digest(xmlNode * local_cib, gboolean sort, gboolean do_filter); char *calculate_on_disk_digest(xmlNode * local_cib); char *calculate_operation_digest(xmlNode * local_cib, const char *version); char *calculate_xml_versioned_digest(xmlNode * input, gboolean sort, gboolean do_filter, const char *version); gboolean validate_xml(xmlNode * xml_blob, const char *validation, gboolean to_logs); gboolean validate_xml_verbose(xmlNode * xml_blob); int update_validation(xmlNode ** xml_blob, int *best, gboolean transform, gboolean to_logs); int get_schema_version(const char *name); const char *get_schema_name(int version); void crm_xml_cleanup(void); static inline xmlNode * __xml_first_child(xmlNode * parent) { xmlNode *child = NULL;
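/* These helpers let callers walk only the element children, e.g.
 * (process_child() is hypothetical):
 *
 *   xmlNode *iter = NULL;
 *   for (iter = __xml_first_child(parent); iter != NULL;
 *        iter = __xml_next(iter)) {
 *       process_child(iter);
 *   }
 */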
if (parent) { child = parent->children; while (child && child->type != XML_ELEMENT_NODE) { child = child->next; } } return child; } static inline xmlNode * __xml_next(xmlNode * child) { if (child) { child = child->next; while (child && child->type != XML_ELEMENT_NODE) { child = child->next; } } return child; } void free_xml(xmlNode * child); xmlNode *first_named_child(xmlNode * parent, const char *name); xmlNode *sorted_xml(xmlNode * input, xmlNode * parent, gboolean recursive); xmlXPathObjectPtr xpath_search(xmlNode * xml_top, const char *path); gboolean cli_config_update(xmlNode ** xml, int *best_version, gboolean to_logs); xmlNode *expand_idref(xmlNode * input, xmlNode * top); xmlNode *getXpathResult(xmlXPathObjectPtr xpathObj, int index); #endif diff --git a/include/crm/crm.h b/include/crm/crm.h index ec17da3c48..d1fe13d20a 100644 --- a/include/crm/crm.h +++ b/include/crm/crm.h @@ -1,190 +1,196 @@ /* * Copyright (C) 2004 Andrew Beekhof * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public * License as published by the Free Software Foundation; either * version 2 of the License, or (at your option) any later version. * * This software is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * General Public License for more details. * * You should have received a copy of the GNU General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ #ifndef CRM__H # define CRM__H +/** + * \file + * \brief A dumping ground + * \ingroup core + */ + # include # include # include # include # undef MIN # undef MAX # include # include # define CRM_FEATURE_SET "3.0.6" # define MINIMUM_SCHEMA_VERSION "pacemaker-1.0" # define LATEST_SCHEMA_VERSION "pacemaker-"CRM_DTD_VERSION # define EOS '\0' # define DIMOF(a) ((int) (sizeof(a)/sizeof(a[0])) ) # ifndef MAX_NAME # define MAX_NAME 256 # endif # ifndef __GNUC__ # define __builtin_expect(expr, result) (expr) # endif /* Some handy macros used by the Linux kernel */ # define __likely(expr) __builtin_expect(expr, 1) # define __unlikely(expr) __builtin_expect(expr, 0) # define CRM_META "CRM_meta" extern const char *crm_system_name; /* *INDENT-OFF* */ /* Clean these up at some point, some probably should be runtime options */ # define SOCKET_LEN 1024 # define APPNAME_LEN 256 # define MAX_IPC_FAIL 5 # define MAX_IPC_DELAY 120 # define MSG_LOG 1 # define DOT_FSA_ACTIONS 1 # define DOT_ALL_FSA_INPUTS 1 /* #define FSA_TRACE 1 */ # define INFINITY_S "INFINITY" # define MINUS_INFINITY_S "-INFINITY" # define INFINITY 1000000 /* Sub-systems */ # define CRM_SYSTEM_DC "dc" # define CRM_SYSTEM_DCIB "dcib" /* The master CIB */ # define CRM_SYSTEM_CIB "cib" # define CRM_SYSTEM_CRMD "crmd" # define CRM_SYSTEM_LRMD "lrmd" # define CRM_SYSTEM_PENGINE "pengine" # define CRM_SYSTEM_TENGINE "tengine" # define CRM_SYSTEM_STONITHD "stonithd" # define CRM_SYSTEM_MCP "pacemakerd" /* Valid operations */ # define CRM_OP_NOOP "noop" # define CRM_OP_JOIN_ANNOUNCE "join_announce" # define CRM_OP_JOIN_OFFER "join_offer" # define CRM_OP_JOIN_REQUEST "join_request" # define CRM_OP_JOIN_ACKNAK "join_ack_nack" # define CRM_OP_JOIN_CONFIRM "join_confirm" # define CRM_OP_DIE "die_no_respawn" # define CRM_OP_RETRIVE_CIB "retrieve_cib" # define CRM_OP_PING "ping" # define CRM_OP_VOTE "vote" # define CRM_OP_NOVOTE 
"no-vote" # define CRM_OP_HELLO "hello" # define CRM_OP_HBEAT "dc_beat" # define CRM_OP_PECALC "pe_calc" # define CRM_OP_ABORT "abort" # define CRM_OP_QUIT "quit" # define CRM_OP_LOCAL_SHUTDOWN "start_shutdown" # define CRM_OP_SHUTDOWN_REQ "req_shutdown" # define CRM_OP_SHUTDOWN "do_shutdown" # define CRM_OP_FENCE "stonith" # define CRM_OP_EVENTCC "event_cc" # define CRM_OP_TEABORT "te_abort" # define CRM_OP_TEABORTED "te_abort_confirmed" /* we asked */ # define CRM_OP_TE_HALT "te_halt" # define CRM_OP_TECOMPLETE "te_complete" # define CRM_OP_TETIMEOUT "te_timeout" # define CRM_OP_TRANSITION "transition" # define CRM_OP_REGISTER "register" # define CRM_OP_DEBUG_UP "debug_inc" # define CRM_OP_DEBUG_DOWN "debug_dec" # define CRM_OP_INVOKE_LRM "lrm_invoke" # define CRM_OP_LRM_REFRESH "lrm_refresh" # define CRM_OP_LRM_QUERY "lrm_query" # define CRM_OP_LRM_DELETE "lrm_delete" # define CRM_OP_LRM_FAIL "lrm_fail" # define CRM_OP_PROBED "probe_complete" # define CRM_OP_REPROBE "probe_again" # define CRM_OP_CLEAR_FAILCOUNT "clear_failcount" # define CRM_OP_RELAXED_SET "one-or-more" # define CRM_OP_RM_NODE_CACHE "rm_node_cache" # define CRMD_JOINSTATE_DOWN "down" # define CRMD_JOINSTATE_PENDING "pending" # define CRMD_JOINSTATE_MEMBER "member" # define CRMD_JOINSTATE_NACK "banned" # define CRMD_ACTION_DELETE "delete" # define CRMD_ACTION_CANCEL "cancel" # define CRMD_ACTION_MIGRATE "migrate_to" # define CRMD_ACTION_MIGRATED "migrate_from" # define CRMD_ACTION_START "start" # define CRMD_ACTION_STARTED "running" # define CRMD_ACTION_STOP "stop" # define CRMD_ACTION_STOPPED "stopped" # define CRMD_ACTION_PROMOTE "promote" # define CRMD_ACTION_PROMOTED "promoted" # define CRMD_ACTION_DEMOTE "demote" # define CRMD_ACTION_DEMOTED "demoted" # define CRMD_ACTION_NOTIFY "notify" # define CRMD_ACTION_NOTIFIED "notified" # define CRMD_ACTION_STATUS "monitor" /* short names */ # define RSC_DELETE CRMD_ACTION_DELETE # define RSC_CANCEL CRMD_ACTION_CANCEL # define RSC_MIGRATE CRMD_ACTION_MIGRATE # define RSC_MIGRATED CRMD_ACTION_MIGRATED # define RSC_START CRMD_ACTION_START # define RSC_STARTED CRMD_ACTION_STARTED # define RSC_STOP CRMD_ACTION_STOP # define RSC_STOPPED CRMD_ACTION_STOPPED # define RSC_PROMOTE CRMD_ACTION_PROMOTE # define RSC_PROMOTED CRMD_ACTION_PROMOTED # define RSC_DEMOTE CRMD_ACTION_DEMOTE # define RSC_DEMOTED CRMD_ACTION_DEMOTED # define RSC_NOTIFY CRMD_ACTION_NOTIFY # define RSC_NOTIFIED CRMD_ACTION_NOTIFIED # define RSC_STATUS CRMD_ACTION_STATUS /* *INDENT-ON* */ typedef GList *GListPtr; # include # include # include # define crm_str_hash g_str_hash_traditional guint g_str_hash_traditional(gconstpointer v); #endif diff --git a/include/crm/error.h b/include/crm/error.h index 56c64ffe87..83e5255fd1 100644 --- a/include/crm/error.h +++ b/include/crm/error.h @@ -1,48 +1,54 @@ /* * Copyright (C) 2012 Andrew Beekhof * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public * License as published by the Free Software Foundation; either * version 2 of the License, or (at your option) any later version. * * This software is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * General Public License for more details. 
* * You should have received a copy of the GNU General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ #ifndef CRM_ERROR__H # define CRM_ERROR__H # include # include +/** + * \file + * \brief Error codes and asserts + * \ingroup core + */ + /* System error codes - /usr/include/asm-generic/errno.h - /usr/include/asm-generic/errno-base.h */ # define CRM_ASSERT(expr) do { \ if(__unlikely((expr) == FALSE)) { \ crm_abort(__FILE__, __PRETTY_FUNCTION__, __LINE__, #expr, TRUE, FALSE); \ } \ } while(0) # define pcmk_ok 0 # define PCMK_ERROR_OFFSET 900 /* Replacements on non-linux systems, see include/portability.h */ # define PCMK_CUSTOM_OFFSET 1000 /* Purely custom codes */ # define pcmk_err_generic 1001 # define pcmk_err_no_quorum 1002 # define pcmk_err_dtd_validation 1003 # define pcmk_err_transform_failed 1004 # define pcmk_err_old_data 1005 # define pcmk_err_diff_failed 1006 # define pcmk_err_diff_resync 1007 const char *pcmk_strerror(int rc); #endif diff --git a/include/crm/lrmd.h b/include/crm/lrmd.h index 2f230c17f7..ffbc325498 100644 --- a/include/crm/lrmd.h +++ b/include/crm/lrmd.h @@ -1,411 +1,417 @@ /* * Copyright (c) 2012 David Vossel * * This library is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA * */ +/** + * \file + * \brief Local Resource Manager + * \ingroup lrm + */ + #ifndef LRMD__H #define LRMD__H typedef struct lrmd_s lrmd_t; typedef struct lrmd_key_value_s { char *key; char *value; struct lrmd_key_value_s *next; } lrmd_key_value_t; /* *INDENT-OFF* */ #define F_LRMD_OPERATION "lrmd_op" #define F_LRMD_CLIENTNAME "lrmd_clientname" #define F_LRMD_CLIENTID "lrmd_clientid" #define F_LRMD_CALLBACK_TOKEN "lrmd_async_id" #define F_LRMD_CALLID "lrmd_callid" #define F_LRMD_CANCEL_CALLID "lrmd_cancel_callid" #define F_LRMD_CALLOPTS "lrmd_callopt" #define F_LRMD_CALLDATA "lrmd_calldata" #define F_LRMD_RC "lrmd_rc" #define F_LRMD_EXEC_RC "lrmd_exec_rc" #define F_LRMD_OP_STATUS "lrmd_exec_op_status" #define F_LRMD_TIMEOUT "lrmd_timeout" #define F_LRMD_CLASS "lrmd_class" #define F_LRMD_PROVIDER "lrmd_provider" #define F_LRMD_TYPE "lrmd_type" #define F_LRMD_ORIGIN "lrmd_origin" #define F_LRMD_RSC_RUN_TIME "lrmd_run_time" #define F_LRMD_RSC_RCCHANGE_TIME "lrmd_rcchange_time" #define F_LRMD_RSC_EXEC_TIME "lrmd_exec_time" #define F_LRMD_RSC_QUEUE_TIME "lrmd_queue_time" #define F_LRMD_RSC_ID "lrmd_rsc_id" #define F_LRMD_RSC_ACTION "lrmd_rsc_action" #define F_LRMD_RSC_USERDATA_STR "lrmd_rsc_userdata_str" #define F_LRMD_RSC_OUTPUT "lrmd_rsc_output" #define F_LRMD_RSC_START_DELAY "lrmd_rsc_start_delay" #define F_LRMD_RSC_INTERVAL "lrmd_rsc_interval" #define F_LRMD_RSC_METADATA "lrmd_rsc_metadata_res" #define F_LRMD_RSC_DELETED "lrmd_rsc_deleted" #define F_LRMD_RSC "lrmd_rsc" #define LRMD_OP_RSC_CHK_REG "lrmd_rsc_check_register" #define 
LRMD_OP_RSC_REG "lrmd_rsc_register" #define LRMD_OP_RSC_EXEC "lrmd_rsc_exec" #define LRMD_OP_RSC_CANCEL "lrmd_rsc_cancel" #define LRMD_OP_RSC_UNREG "lrmd_rsc_unregister" #define LRMD_OP_RSC_INFO "lrmd_rsc_info" #define LRMD_OP_RSC_METADATA "lrmd_rsc_metadata" #define T_LRMD "lrmd" #define T_LRMD_REPLY "lrmd_reply" #define T_LRMD_NOTIFY "lrmd_notify" /* *INDENT-ON* */ lrmd_t *lrmd_api_new(void); bool lrmd_dispatch(lrmd_t *lrmd); void lrmd_api_delete(lrmd_t * lrmd); lrmd_key_value_t *lrmd_key_value_add(lrmd_key_value_t *kvp, const char *key, const char *value); /* *INDENT-OFF* */ /* Reserved for future use */ enum lrmd_call_options { lrmd_opt_none = 0x00000000, /* lrmd_opt_sync_call = 0x00000001, //Not implemented, patches welcome. */ /*! Only notify the client originating an exec() of the results */ lrmd_opt_notify_orig_only = 0x00000002, /*! Drop recurring operations initiated by a client when the client disconnects. * This call_option is only valid when registering a resource. */ lrmd_opt_drop_recurring = 0x00000003, /*! Only send out notifications for recurring operations when the result changes */ lrmd_opt_notify_changes_only = 0x00000004, }; enum lrmd_callback_event { lrmd_event_register, lrmd_event_unregister, lrmd_event_exec_complete, lrmd_event_disconnect, }; enum lrmd_exec_rc { PCMK_EXECRA_OK = 0, PCMK_EXECRA_UNKNOWN_ERROR = 1, PCMK_EXECRA_INVALID_PARAM = 2, PCMK_EXECRA_UNIMPLEMENT_FEATURE = 3, PCMK_EXECRA_INSUFFICIENT_PRIV = 4, PCMK_EXECRA_NOT_INSTALLED = 5, PCMK_EXECRA_NOT_CONFIGURED = 6, PCMK_EXECRA_NOT_RUNNING = 7, PCMK_EXECRA_RUNNING_MASTER = 8, PCMK_EXECRA_FAILED_MASTER = 9, /* For status command only */ PCMK_EXECRA_STATUS_UNKNOWN = 14, }; /* *INDENT-ON* */ typedef struct lrmd_event_data_s { /*! Type of event, register, unregister, call_completed... */ enum lrmd_callback_event type; /*! The resource this event occurred on. */ const char *rsc_id; /*! The action performed, start, stop, monitor... */ const char *op_type; /*! The userdata string given to the exec() api function */ const char *user_data; /*! The client api call id associated with this event */ int call_id; /*! The operation's timeout period in ms. */ int timeout; /*! The operation's recurring interval in ms. */ int interval; /*! The operation's start delay value in ms. */ int start_delay; /*! The operation that just completed is on a deleted rsc. */ int rsc_deleted; /*! The executed ra return code */ enum lrmd_exec_rc rc; /*! The lrmd status returned for exec_complete events */ int op_status; /*! stdout from resource agent operation */ const char *output; /*! Timestamp of when op ran */ unsigned int t_run; /*! Timestamp of last rc change */ unsigned int t_rcchange; /*! Length of time the op took to execute */ unsigned int exec_time; /*! Length of time the op spent in the queue */ unsigned int queue_time; /* This is a GHashTable containing the * parameters given to the operation */ void *params; } lrmd_event_data_t; lrmd_event_data_t *lrmd_copy_event(lrmd_event_data_t *event); void lrmd_free_event(lrmd_event_data_t *event); typedef struct lrmd_rsc_info_s { char *id; char *type; char *class; char *provider; } lrmd_rsc_info_t; lrmd_rsc_info_t *lrmd_copy_rsc_info(lrmd_rsc_info_t *rsc_info); void lrmd_free_rsc_info(lrmd_rsc_info_t *rsc_info); typedef void (*lrmd_event_callback)(lrmd_event_data_t *event); typedef struct lrmd_list_s { const char *val; struct lrmd_list_s *next; } lrmd_list_t; void lrmd_list_freeall(lrmd_list_t *head); typedef struct lrmd_api_operations_s { /*! * \brief Connect to the lrmd.
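 *
 * A minimal connection sketch (client name hypothetical; error handling
 * elided):
 * \code
 * lrmd_t *lrmd = lrmd_api_new();
 * int fd = 0;
 *
 * if (lrmd->cmds->connect(lrmd, "example-client", &fd) != 0) {
 *     lrmd_api_delete(lrmd); // daemon not reachable
 * }
 * \endcode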
* * \retval 0, success * \retval negative error code on failure */ int (*connect) (lrmd_t *lrmd, const char *client_name, int *fd); /*! * \brief Disconnect from the lrmd. * * \retval 0, success * \retval negative error code on failure */ int (*disconnect)(lrmd_t *lrmd); /*! * \brief Register a resource with the lrmd. * * \note Synchronous, guaranteed to occur in daemon before function returns. * * \retval 0, success * \retval negative error code on failure */ int (*register_rsc) (lrmd_t *lrmd, const char *rsc_id, const char *class, const char *provider, const char *agent, enum lrmd_call_options options); /*! * \brief Retrieve registration info for a rsc * * \retval info on success * \retval NULL on failure */ lrmd_rsc_info_t *(*get_rsc_info) (lrmd_t *lrmd, const char *rsc_id, enum lrmd_call_options options); /*! * \brief Unregister a resource from the lrmd. * * \note All pending and recurring operations will be cancelled * automatically. * * \note Synchronous, guaranteed to occur in daemon before function returns. * * \retval 0, success * \retval -1, success, but operations are currently executing on the rsc which will * return once they are completed. * \retval negative error code on failure * */ int (*unregister_rsc) (lrmd_t *lrmd, const char *rsc_id, enum lrmd_call_options options); /*! * \brief Sets the callback to receive lrmd events on. */ void (*set_callback) (lrmd_t *lrmd, lrmd_event_callback callback); /*! * \brief Issue a command on a resource * * \note Asynchronous, command is queued in daemon on function return, but * execution of command is not synced. * * \note Operations on individual resources are guaranteed to occur * in the order the client api calls them in. * * \note Operations between different resources are not guaranteed * to occur in any specific order in relation to one another * regardless of what order the client api is called in. * \retval call_id to track async event result on success * \retval negative error code on failure */ int (*exec)(lrmd_t *lrmd, const char *rsc_id, const char *action, const char *userdata, /* userdata string given back in event notification */ int interval, /* ms */ int timeout, /* ms */ int start_delay, /* ms */ enum lrmd_call_options options, lrmd_key_value_t *params); /* ownership of params is given up to api here */ /*! * \brief Cancel a recurring command. * * \note Synchronous, guaranteed to occur in daemon before function returns. * * \note The cancel is completed async from this call. * We can be guaranteed the cancel has completed once * the callback receives an exec_complete event with * the lrmd_op_status signifying that the operation is * cancelled. * \note For each resource, cancel operations and exec operations * are processed in the order they are received. * It is safe to assume that for a single resource, a cancel * will occur in the lrmd before an exec if the client's cancel * api call occurs before the exec api call. * * It is not however safe to assume any operation on one resource will * occur before an operation on another resource regardless of * the order the client api is called in. * * \retval 0, cancel command sent. * \retval negative error code on failure */ int (*cancel)(lrmd_t *lrmd, const char *rsc_id, const char *action, int interval); /*! * \brief Get the metadata documentation for a resource. * * \note Value is returned in output. 
Output must be freed when set * * \retval lrmd_ok success * \retval negative error code on failure */ int (*get_metadata) (lrmd_t *lrmd, const char *class, const char *provider, const char *agent, char **output, enum lrmd_call_options options); /*! * \brief Retrieve a list of installed resource agents. * * \note if class is not provided, all known agents will be returned * \note list must be freed using lrmd_list_freeall() * * \retval num items in list on success * \retval negative error code on failure */ int (*list_agents)(lrmd_t *lrmd, lrmd_list_t **agents, const char *class, const char *provider); /*! * \brief Retrieve a list of resource agent providers * * \note When the agent is provided, only the agent's provider will be returned * \note When no agent is supplied, all providers will be returned. * \note List must be freed using lrmd_list_freeall() * * \retval num items in list on success * \retval negative error code on failure */ int (*list_ocf_providers)(lrmd_t *lrmd, const char *agent, lrmd_list_t **providers); /*! * \brief Retrieve a list of standards supported by this machine/installation * * \note List must be freed using lrmd_list_freeall() * * \retval num items in list on success * \retval negative error code on failure */ int (*list_standards)(lrmd_t *lrmd, lrmd_list_t **standards); } lrmd_api_operations_t; struct lrmd_s { lrmd_api_operations_t *cmds; void *private; }; static inline const char * lrmd_event_rc2str(enum lrmd_exec_rc rc) { switch(rc) { case PCMK_EXECRA_OK: return "ok"; case PCMK_EXECRA_UNKNOWN_ERROR: return "unknown error"; case PCMK_EXECRA_INVALID_PARAM: return "invalid parameter"; case PCMK_EXECRA_UNIMPLEMENT_FEATURE: return "unimplemented feature"; case PCMK_EXECRA_INSUFFICIENT_PRIV: return "insufficient privileges"; case PCMK_EXECRA_NOT_INSTALLED: return "not installed"; case PCMK_EXECRA_NOT_CONFIGURED: return "not configured"; case PCMK_EXECRA_NOT_RUNNING: return "not running"; case PCMK_EXECRA_RUNNING_MASTER: return "master"; case PCMK_EXECRA_FAILED_MASTER: return "master (failed)"; case PCMK_EXECRA_STATUS_UNKNOWN: return "status: unknown"; default: break; } return ""; } static inline const char * lrmd_event_type2str(enum lrmd_callback_event type) { switch (type) { case lrmd_event_register: return "register"; case lrmd_event_unregister: return "unregister"; case lrmd_event_exec_complete: return "exec_complete"; case lrmd_event_disconnect: return "disconnect"; } return "unknown"; } #endif diff --git a/include/crm/services.h b/include/crm/services.h index fed9bcb608..c5755b98a2 100644 --- a/include/crm/services.h +++ b/include/crm/services.h @@ -1,355 +1,355 @@ /* * Copyright (C) 2010 Andrew Beekhof * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * * This software is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * General Public License for more details. 
* * You should have received a copy of the GNU General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ /** * \file * \brief Services API - * \ingroup coreapi + * \ingroup core */ #ifndef __PCMK_SERVICES__ #define __PCMK_SERVICES__ #ifdef __cplusplus extern "C" { #endif #include #include #ifndef OCF_ROOT_DIR #define OCF_ROOT_DIR "/usr/lib/ocf" #endif #ifndef LSB_ROOT_DIR #define LSB_ROOT_DIR "/etc/init.d" #endif /* TODO: Autodetect these two? */ #ifndef SYSTEMCTL #define SYSTEMCTL "/bin/systemctl" #endif #ifndef SERVICE_SCRIPT #define SERVICE_SCRIPT "/sbin/service" #endif /* *INDENT-OFF* */ enum lsb_exitcode { PCMK_LSB_OK = 0, PCMK_LSB_UNKNOWN_ERROR = 1, PCMK_LSB_INVALID_PARAM = 2, PCMK_LSB_UNIMPLEMENT_FEATURE = 3, PCMK_LSB_INSUFFICIENT_PRIV = 4, PCMK_LSB_NOT_INSTALLED = 5, PCMK_LSB_NOT_CONFIGURED = 6, PCMK_LSB_NOT_RUNNING = 7, /* 150-199 reserved for application use */ PCMK_LSB_SIGNAL = 194, PCMK_LSB_NOT_SUPPORTED = 195, PCMK_LSB_PENDING = 196, PCMK_LSB_CANCELLED = 197, PCMK_LSB_TIMEOUT = 198, PCMK_LSB_OTHER_ERROR = 199, }; /* The return codes for the status operation are not the same as for other * operations - go figure */ enum lsb_status_exitcode { PCMK_LSB_STATUS_OK = 0, PCMK_LSB_STATUS_VAR_PID = 1, PCMK_LSB_STATUS_VAR_LOCK = 2, PCMK_LSB_STATUS_NOT_RUNNING = 3, PCMK_LSB_STATUS_NOT_INSTALLED = 4, /* 150-199 reserved for application use */ PCMK_LSB_STATUS_SIGNAL = 194, PCMK_LSB_STATUS_NOT_SUPPORTED = 195, PCMK_LSB_STATUS_PENDING = 196, PCMK_LSB_STATUS_CANCELLED = 197, PCMK_LSB_STATUS_TIMEOUT = 198, PCMK_LSB_STATUS_OTHER_ERROR = 199, }; enum ocf_exitcode { PCMK_OCF_OK = 0, PCMK_OCF_UNKNOWN_ERROR = 1, PCMK_OCF_INVALID_PARAM = 2, PCMK_OCF_UNIMPLEMENT_FEATURE = 3, PCMK_OCF_INSUFFICIENT_PRIV = 4, PCMK_OCF_NOT_INSTALLED = 5, PCMK_OCF_NOT_CONFIGURED = 6, PCMK_OCF_NOT_RUNNING = 7, PCMK_OCF_RUNNING_MASTER = 8, PCMK_OCF_FAILED_MASTER = 9, /* 150-199 reserved for application use */ PCMK_OCF_SIGNAL = 194, PCMK_OCF_NOT_SUPPORTED = 195, PCMK_OCF_PENDING = 196, PCMK_OCF_CANCELLED = 197, PCMK_OCF_TIMEOUT = 198, PCMK_OCF_OTHER_ERROR = 199, /* Keep the same codes as PCMK_LSB */ }; enum op_status { PCMK_LRM_OP_PENDING = -1, PCMK_LRM_OP_DONE, PCMK_LRM_OP_CANCELLED, PCMK_LRM_OP_TIMEOUT, PCMK_LRM_OP_NOTSUPPORTED, PCMK_LRM_OP_ERROR }; /* *INDENT-ON* */ typedef struct svc_action_private_s svc_action_private_t; typedef struct svc_action_s { char *id; char *rsc; char *action; int interval; char *standard; char *provider; char *agent; int timeout; GHashTable *params; int rc; int pid; int cancel; int status; int sequence; int expected_rc; char *stderr_data; char *stdout_data; /** * Data stored by the creator of the action. * * This may be used to hold data that is needed later on by a callback, * for example. */ void *cb_data; svc_action_private_t *opaque; } svc_action_t; /** * Get a list of files or directories in a given path * * \param[in] root full path to a directory to read * \param[in] files true to get a list of files, false for a list of directories * * \return a list of what was found. The list items are gchar *. This list _must_ * be destroyed using g_list_free_full(list, free). */ GList * get_directory_list(const char *root, gboolean files); /** * Get a list of services * * \return a list of services. The list items are gchar *. This list _must_ * be destroyed using g_list_free_full(list, free).
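 *
 * A hypothetical consumer of the returned list:
 * \code
 * GList *names = services_list();
 * GList *iter = NULL;
 *
 * for (iter = names; iter != NULL; iter = iter->next) {
 *     crm_info("Found service: %s", (const char *) iter->data);
 * }
 * g_list_free_full(names, free);
 * \endcode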
*/ GList * services_list(void); /** * Get a list of providers * * \param[in] standard the standard for providers to check for (such as "ocf") * * \return a list of providers. The list items are gchar *. This list _must_ * be destroyed using g_list_free_full(list, free). */ GList * resources_list_providers(const char *standard); /** * Get a list of resource agents * * \param[in] standard the standard for resource agents to check for * (such as "ocf", "lsb", or "windows") * * \return a list of resource agents. The list items are gchar *. This list _must_ * be destroyed using g_list_free_full(list, free). */ GList * resources_list_agents(const char *standard, const char *provider); /** * Get list of available standards * * \return a list of resource standards. The list items are char *. This list _must_ * be destroyed using g_list_free_full(list, free). */ GList * resources_list_standards(void); svc_action_t * services_action_create(const char *name, const char *action, int interval /* ms */, int timeout /* ms */); /** * Create a resource action. * * \param[in] timeout the timeout in milliseconds * \param[in] interval how often to repeat this action, in milliseconds. * If this value is 0, only execute this action one time. * * \post After the call, 'params' is owned, and later freed, by the svc_action_t result */ svc_action_t * resources_action_create(const char *name, const char *standard, const char *provider, const char *agent, const char *action, int interval /* ms */, int timeout /* ms */, GHashTable *params); /** * Utilize services API to execute an arbitrary command. * * This API has useful infrastructure in place to be able to run a command * in the background and get notified via a callback when the command finishes. * * \param[in] exec command to execute * \param[in] args arguments to the command, NULL terminated * * \return a svc_action_t object, used to pass to the execute function * (services_action_sync() or services_action_async()) and is * provided to the callback. */ svc_action_t * services_action_create_generic(const char *exec, const char *args[]); void services_action_free(svc_action_t *op); gboolean services_action_sync(svc_action_t *op); /** * Run an action asynchronously.
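 *
 * A sketch of the intended call pattern (callback body hypothetical):
 * \code
 * static void my_callback(svc_action_t *op)
 * {
 *     crm_info("Action %s finished: rc=%d status=%d", op->id, op->rc, op->status);
 * }
 *
 * // ... later: services_action_async(op, my_callback);
 * \endcode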
* * \param[in] op services action data * \param[in] action_callback callback for when the action completes * * \retval TRUE successfully started execution * \retval FALSE failed to start execution, no callback will be received */ gboolean services_action_async(svc_action_t *op, void (*action_callback)(svc_action_t *)); gboolean services_action_cancel(const char *name, const char *action, int interval); static inline const char* services_lrm_status_str(enum op_status status) { switch (status) { case PCMK_LRM_OP_PENDING: return "pending"; case PCMK_LRM_OP_DONE: return "complete"; case PCMK_LRM_OP_CANCELLED: return "Cancelled"; case PCMK_LRM_OP_TIMEOUT: return "Timed Out"; case PCMK_LRM_OP_NOTSUPPORTED: return "NOT SUPPORTED"; case PCMK_LRM_OP_ERROR: return "Error"; default: return "UNKNOWN!"; } } static inline const char* services_ocf_exitcode_str(enum ocf_exitcode code) { switch (code) { case PCMK_OCF_OK: return "OCF_OK"; case PCMK_OCF_UNKNOWN_ERROR: return "OCF_UNKNOWN_ERROR"; case PCMK_OCF_INVALID_PARAM: return "OCF_INVALID_PARAM"; case PCMK_OCF_UNIMPLEMENT_FEATURE: return "OCF_UNIMPLEMENT_FEATURE"; case PCMK_OCF_INSUFFICIENT_PRIV: return "OCF_INSUFFICIENT_PRIV"; case PCMK_OCF_NOT_INSTALLED: return "OCF_NOT_INSTALLED"; case PCMK_OCF_NOT_CONFIGURED: return "OCF_NOT_CONFIGURED"; case PCMK_OCF_NOT_RUNNING: return "OCF_NOT_RUNNING"; case PCMK_OCF_RUNNING_MASTER: return "OCF_RUNNING_MASTER"; case PCMK_OCF_FAILED_MASTER: return "OCF_FAILED_MASTER"; case PCMK_OCF_SIGNAL: return "OCF_SIGNAL"; case PCMK_OCF_NOT_SUPPORTED: return "OCF_NOT_SUPPORTED"; case PCMK_OCF_PENDING: return "OCF_PENDING"; case PCMK_OCF_CANCELLED: return "OCF_CANCELLED"; case PCMK_OCF_TIMEOUT: return "OCF_TIMEOUT"; case PCMK_OCF_OTHER_ERROR: return "OCF_OTHER_ERROR"; default: return "unknown"; } } static inline enum ocf_exitcode services_get_ocf_exitcode(char *action, int lsb_exitcode) { if (action != NULL && strcmp("status", action) == 0) { switch (lsb_exitcode) { case PCMK_LSB_STATUS_OK: return PCMK_OCF_OK; case PCMK_LSB_STATUS_VAR_PID: return PCMK_OCF_NOT_RUNNING; case PCMK_LSB_STATUS_VAR_LOCK: return PCMK_OCF_NOT_RUNNING; case PCMK_LSB_STATUS_NOT_RUNNING: return PCMK_OCF_NOT_RUNNING; case PCMK_LSB_STATUS_NOT_INSTALLED: return PCMK_OCF_UNKNOWN_ERROR; default: return PCMK_OCF_UNKNOWN_ERROR; } } else if (lsb_exitcode > PCMK_LSB_NOT_RUNNING) { return PCMK_OCF_UNKNOWN_ERROR; } /* For non-status operations, the PCMK_LSB and PCMK_OCF codes share the same meaning * for rc <= 7 */ return (enum ocf_exitcode)lsb_exitcode; } #ifdef __cplusplus } #endif #endif /* __PCMK_SERVICES__ */ diff --git a/include/crm/stonith-ng.h b/include/crm/stonith-ng.h index 164ec65d23..f991b36da2 100644 --- a/include/crm/stonith-ng.h +++ b/include/crm/stonith-ng.h @@ -1,426 +1,433 @@ /* * Copyright (C) 2004 Andrew Beekhof * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public * License as published by the Free Software Foundation; either * version 2 of the License, or (at your option) any later version. * * This software is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * General Public License for more details. * * You should have received a copy of the GNU General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ + +/** + * \file + * \brief Fencing, aka
STONITH + * \ingroup fencing + */ + #ifndef STONITH_NG__H # define STONITH_NG__H # include # include /* TO-DO: Work out how to drop this requirement */ # include #define T_STONITH_NOTIFY_DISCONNECT "st_notify_disconnect" #define T_STONITH_NOTIFY_FENCE "st_notify_fence" /* *INDENT-OFF* */ enum stonith_state { stonith_connected_command, stonith_connected_query, stonith_disconnected, }; enum stonith_call_options { st_opt_none = 0x00000000, st_opt_verbose = 0x00000001, st_opt_allow_suicide = 0x00000002, st_opt_manual_ack = 0x00000008, st_opt_discard_reply = 0x00000010, /* st_opt_all_replies = 0x00000020, */ st_opt_topology = 0x00000040, st_opt_scope_local = 0x00000100, st_opt_cs_nodeid = 0x00000200, st_opt_sync_call = 0x00001000, /*! Allow the timeout period for a callback to be adjusted * based on the time the server reports the operation will take. */ st_opt_timeout_updates = 0x00002000, /*! Only report back if operation is a success in callback */ st_opt_report_only_success = 0x00004000, }; #define stonith_default_options = stonith_none /*! Order matters here, do not change values */ enum op_state { st_query, st_exec, st_done, st_duplicate, st_failed, }; typedef struct stonith_key_value_s { char *key; char *value; struct stonith_key_value_s *next; } stonith_key_value_t; typedef struct stonith_history_s { char *target; char *action; char *origin; char *delegate; int completed; int state; struct stonith_history_s *next; } stonith_history_t; typedef struct stonith_s stonith_t; typedef struct stonith_event_s { char *id; char *type; char *message; char *operation; int result; char *origin; char *target; char *action; char *executioner; char *device; /*! The name of the client that initiated the action. */ char *client_origin; } stonith_event_t; typedef struct stonith_callback_data_s { int rc; int call_id; void *userdata; } stonith_callback_data_t; typedef struct stonith_api_operations_s { /*! * \brief Destroy the stonith api structure. */ int (*free) (stonith_t *st); /*! * \brief Connect to the local stonith daemon. * * \retval 0, success * \retval negative error code on failure */ int (*connect) (stonith_t *st, const char *name, int *stonith_fd); /*! * \brief Disconnect from the local stonith daemon. * * \retval 0, success * \retval negative error code on failure */ int (*disconnect)(stonith_t *st); /*! * \brief Remove a registered stonith device with the local stonith daemon. * * \note Synchronous, guaranteed to occur in daemon before function returns. * * \retval 0, success * \retval negative error code on failure */ int (*remove_device)( stonith_t *st, int options, const char *name); /*! * \brief Register a stonith device with the local stonith daemon. * * \note Synchronous, guaranteed to occur in daemon before function returns. * * \retval 0, success * \retval negative error code on failure */ int (*register_device)( stonith_t *st, int options, const char *id, const char *namespace, const char *agent, stonith_key_value_t *params); /*! * \brief Remove a fencing level for a specific node. * * \note This feature is not available when stonith is in standalone mode. * * \retval 0, success * \retval negative error code on failure */ int (*remove_level)( stonith_t *st, int options, const char *node, int level); /*! * \brief Register a fencing level containing the fencing devices to be used * at that level for a specific node. * * \note This feature is not available when stonith is in standalone mode. 
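 *
 * For illustration only (device ids hypothetical; this assumes the daemon
 * keeps its own copy of the list, so it is freed here after registration):
 * \code
 * stonith_key_value_t *devices = NULL;
 *
 * devices = stonith_key_value_add(devices, NULL, "ipmi-node1");
 * devices = stonith_key_value_add(devices, NULL, "pdu-outlet4");
 * st->cmds->register_level(st, st_opt_none, "node1", 1, devices);
 * stonith_key_value_freeall(devices, 1, 1);
 * \endcode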
* * \retval 0, success * \retval negative error code on failure */ int (*register_level)( stonith_t *st, int options, const char *node, int level, stonith_key_value_t *device_list); /*! * \brief Get the metadata documentation for a resource. * * \note Value is returned in output. Output must be freed when set. * * \retval 0 success * \retval negative error code on failure */ int (*metadata)(stonith_t *st, int options, const char *device, const char *namespace, char **output, int timeout); /*! * \brief Retrieve a list of installed stonith agents * * \note if namespace is not provided, all known agents will be returned * \note list must be freed using stonith_key_value_freeall() * \note call_options parameter is not used, it is reserved for future use. * * \retval num items in list on success * \retval negative error code on failure */ int (*list_agents)(stonith_t *stonith, int call_options, const char *namespace, stonith_key_value_t **devices, int timeout); /*! * \brief Retrieve string listing hosts and port assignments from a local stonith device. * * \retval 0 on success * \retval negative error code on failure */ int (*list)(stonith_t *st, int options, const char *id, char **list_output, int timeout); /*! * \brief Check to see if a local stonith device is reachable * * \retval 0 on success * \retval negative error code on failure */ int (*monitor)(stonith_t *st, int options, const char *id, int timeout); /*! * \brief Check to see if a local stonith device's port is reachable * * \retval 0 on success * \retval negative error code on failure */ int (*status)(stonith_t *st, int options, const char *id, const char *port, int timeout); /*! * \brief Retrieve a list of registered stonith devices. * * \note If node is provided, only devices that can fence the node id * will be returned. * * \retval num items in list on success * \retval negative error code on failure */ int (*query)(stonith_t *st, int options, const char *node, stonith_key_value_t **devices, int timeout); /*! * \brief Issue a fencing action against a node. * * \note Possible actions are, 'on', 'off', and 'reboot'. * * \param st, stonith connection * \param options, call options * \param node, The target node to fence * \param action, The fencing action to take * \param timeout, The default per device timeout to use with each device * capable of fencing the target. * * \retval 0 success * \retval negative error code on failure. */ int (*fence)(stonith_t *st, int options, const char *node, const char *action, int timeout, int tolerance); /*! * \brief Manually confirm that a node is down. * * \retval 0 success * \retval negative error code on failure. */ int (*confirm)(stonith_t *st, int options, const char *node); /*! * \brief Retrieve a list of fencing operations that have occurred for a specific node. * * \note History is not available in standalone mode. * * \retval 0 success * \retval negative error code on failure. */ int (*history)(stonith_t *st, int options, const char *node, stonith_history_t **output, int timeout); int (*register_notification)( stonith_t *st, const char *event, void (*notify)(stonith_t *st, stonith_event_t *e)); int (*remove_notification)(stonith_t *st, const char *event); /*! * \brief Register a callback to receive the result of an async call id * * \param call_id, The call id to register the callback for. 
* \param timeout, The default timeout period to wait until this callback expires * \param options, Option flags, st_opt_timeout_updates and st_opt_report_only_success are the * only valid options for this function. * \param userdata, A pointer that will be handed back in the callback. * \param callback_name, Unique name given to callback * \param callback, The callback function * * \retval 0 success * \retval negative error code on failure. */ int (*register_callback)(stonith_t *st, int call_id, int timeout, int options, void *userdata, const char *callback_name, void (*callback)(stonith_t *st, stonith_callback_data_t *data)); /*! * \brief Remove a registered callback for a given call id. */ int (*remove_callback)(stonith_t *st, int call_id, bool all_callbacks); } stonith_api_operations_t; struct stonith_s { enum stonith_state state; int call_id; int call_timeout; void *private; stonith_api_operations_t *cmds; }; /* *INDENT-ON* */ /* Core functions */ stonith_t *stonith_api_new(void); void stonith_api_delete(stonith_t * st); void stonith_dump_pending_callbacks(stonith_t * st); const char *get_stonith_provider(const char *agent, const char *provider); bool stonith_dispatch(stonith_t * st); stonith_key_value_t *stonith_key_value_add(stonith_key_value_t * kvp, const char *key, const char *value); void stonith_key_value_freeall(stonith_key_value_t * kvp, int keys, int values); /* Basic helpers that allow nodes to be fenced and the history to be * queried without mainloop or the caller understanding the full API * * At least one of nodeid and uname are required */ int stonith_api_kick(int nodeid, const char *uname, int timeout, bool off); time_t stonith_api_time(int nodeid, const char *uname, bool in_progress); /* * Helpers for using the above functions without install-time dependencies * * Usage: * #include * * To turn a node off by corosync nodeid: * stonith_api_kick_helper(nodeid, 120, 1); * * To check the last fence date/time (also by nodeid): * last = stonith_api_time_helper(nodeid, 0); * * To check if fencing is in progress: * if(stonith_api_time_helper(nodeid, 1) > 0) { ... } * * eg.
#include #include #include int main(int argc, char ** argv) { int rc = 0; int nodeid = 102; time_t when = stonith_api_time_helper(nodeid, 0); printf("%d last fenced at %s", nodeid, ctime(&when)); rc = stonith_api_kick_helper(nodeid, 120, 1); printf("%d fence result: %d\n", nodeid, rc); when = stonith_api_time_helper(nodeid, 0); printf("%d last fenced at %s", nodeid, ctime(&when)); return 0; } */ # define STONITH_LIBRARY "libstonithd.so.2" static inline int stonith_api_kick_helper(int nodeid, int timeout, bool off) { static void *st_library = NULL; static int (*st_kick_fn) (int nodeid, const char *uname, int timeout, bool off) = NULL; if (st_library == NULL) { st_library = dlopen(STONITH_LIBRARY, RTLD_LAZY); } if (st_library && st_kick_fn == NULL) { st_kick_fn = dlsym(st_library, "stonith_api_kick"); } if (st_kick_fn == NULL) { return -ELIBACC; } return (*st_kick_fn) (nodeid, NULL, timeout, off); } static inline time_t stonith_api_time_helper(int nodeid, bool in_progress) { static void *st_library = NULL; static time_t(*st_time_fn) (int nodeid, const char *uname, bool in_progress) = NULL; if (st_library == NULL) { st_library = dlopen(STONITH_LIBRARY, RTLD_LAZY); } if (st_library && st_time_fn == NULL) { st_time_fn = dlsym(st_library, "stonith_api_time"); } if (st_time_fn == NULL) { return 0; } return (*st_time_fn) (nodeid, NULL, in_progress); } #endif diff --git a/include/doxygen.h b/include/doxygen.h new file mode 100644 index 0000000000..cafe44a99d --- /dev/null +++ b/include/doxygen.h @@ -0,0 +1,68 @@ +/* doxygen.h */ + +/* + * Copyright (C) 2006 - 2012 + * Andrew Beekhof + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + */ + +#ifndef DOXYGEN__H +#define DOXYGEN__H + +/** + * \file + * \brief Fake header file that contains doxygen documentation. + * \author Andrew Beekhof + * + * The purpose of this file is to provide a file that can be used to create + * doxygen pages. It should contain _only_ comment blocks. + * + * + * \defgroup core Core API + * \defgroup date ISO-8601 Date/Time API + * \defgroup cib Configuration API + * \defgroup lrmd Local Resource Manager API + * \defgroup pengine Policy Engine API + * \defgroup fencing Fencing API + */ + +/** + * \mainpage + * Welcome to the developer documentation for The Pacemaker Project! For more + * information about Pacemaker, please visit the + * project web site. + * + * Here are some pointers on where to go from here.
+ * + * Using Pacemaker APIs: + * - \ref core + * - \ref date + * - \ref cib + * - \ref lrmd + * - \ref pengine + * - \ref fencing + * + * Contributing to the Pacemaker Project: + * - \ref commit_messages + * - \ref coding_guidelines + */ + +/** + * \page coding_guidelines Coding Guidelines + * \verbinclude doc/coding_guidelines.txt + */ + +#endif /* DOXYGEN__H */ diff --git a/lib/cib/cib_ops.c b/lib/cib/cib_ops.c index 66f0e20af6..1ec60f71ef 100644 --- a/lib/cib/cib_ops.c +++ b/lib/cib/cib_ops.c @@ -1,978 +1,982 @@ /* * Copyright (C) 2004 Andrew Beekhof * * This library is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ #include #include #include #include #include #include #include #include #include #include #include #include #include int cib_process_query(const char *op, int options, const char *section, xmlNode * req, xmlNode * input, xmlNode * existing_cib, xmlNode ** result_cib, xmlNode ** answer) { xmlNode *obj_root = NULL; int result = pcmk_ok; crm_trace("Processing \"%s\" event for section=%s", op, crm_str(section)); if (options & cib_xpath) { return cib_process_xpath(op, options, section, req, input, existing_cib, result_cib, answer); } CRM_CHECK(*answer == NULL, free_xml(*answer)); *answer = NULL; if (safe_str_eq(XML_CIB_TAG_SECTION_ALL, section)) { section = NULL; } obj_root = get_object_root(section, existing_cib); if (obj_root == NULL) { result = -ENXIO; } else { *answer = obj_root; } if (result == pcmk_ok && *answer == NULL) { crm_err("Error creating query response"); result = -ENOMSG; } return result; } int cib_process_erase(const char *op, int options, const char *section, xmlNode * req, xmlNode * input, xmlNode * existing_cib, xmlNode ** result_cib, xmlNode ** answer) { int result = pcmk_ok; crm_trace("Processing \"%s\" event", op); *answer = NULL; free_xml(*result_cib); *result_cib = createEmptyCib(); copy_in_properties(*result_cib, existing_cib); cib_update_counter(*result_cib, XML_ATTR_GENERATION, FALSE); return result; } int cib_process_upgrade(const char *op, int options, const char *section, xmlNode * req, xmlNode * input, xmlNode * existing_cib, xmlNode ** result_cib, xmlNode ** answer) { int rc = 0; int new_version = 0; int current_version = 0; const char *value = crm_element_value(existing_cib, XML_ATTR_VALIDATION);; *answer = NULL; crm_trace("Processing \"%s\" event", op); if (value != NULL) { current_version = get_schema_version(value); } rc = update_validation(result_cib, &new_version, TRUE, TRUE); if (new_version > current_version) { return pcmk_ok; } return rc; } int cib_process_bump(const char *op, int options, const char *section, xmlNode * req, xmlNode * input, xmlNode * existing_cib, xmlNode ** result_cib, xmlNode ** answer) { int result = pcmk_ok; crm_trace("Processing \"%s\" event for epoch=%s", op, crm_str(crm_element_value(existing_cib, XML_ATTR_GENERATION))); *answer = NULL; 
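    /* Bumping XML_ATTR_GENERATION ("epoch") below marks a new configuration
     * generation; reset=FALSE means an existing value is incremented rather
     * than restarted at 1.
     */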
cib_update_counter(*result_cib, XML_ATTR_GENERATION, FALSE); return result; } int cib_update_counter(xmlNode * xml_obj, const char *field, gboolean reset) { char *new_value = NULL; char *old_value = NULL; int int_value = -1; if (reset == FALSE && crm_element_value(xml_obj, field) != NULL) { old_value = crm_element_value_copy(xml_obj, field); } if (old_value != NULL) { new_value = calloc(1, 128); int_value = atoi(old_value); sprintf(new_value, "%d", ++int_value); } else { new_value = strdup("1"); } crm_trace("%s %d(%s)->%s", field, int_value, crm_str(old_value), crm_str(new_value)); crm_xml_add(xml_obj, field, new_value); free(new_value); free(old_value); return pcmk_ok; } int cib_process_replace(const char *op, int options, const char *section, xmlNode * req, xmlNode * input, xmlNode * existing_cib, xmlNode ** result_cib, xmlNode ** answer) { const char *tag = NULL; int result = pcmk_ok; crm_trace("Processing \"%s\" event for section=%s", op, crm_str(section)); if (options & cib_xpath) { return cib_process_xpath(op, options, section, req, input, existing_cib, result_cib, answer); } *answer = NULL; if (input == NULL) { return -EINVAL; } tag = crm_element_name(input); if (safe_str_eq(XML_CIB_TAG_SECTION_ALL, section)) { section = NULL; } else if (safe_str_eq(tag, section)) { section = NULL; } if (safe_str_eq(tag, XML_TAG_CIB)) { int updates = 0; int epoch = 0; int admin_epoch = 0; int replace_updates = 0; int replace_epoch = 0; int replace_admin_epoch = 0; const char *reason = NULL; const char *peer = crm_element_value(req, F_ORIG); const char *digest = crm_element_value(req, XML_ATTR_DIGEST); if(digest) { const char *version = crm_element_value(req, XML_ATTR_CRM_VERSION); char *digest_verify = calculate_xml_versioned_digest(input, FALSE, TRUE, version?version:CRM_FEATURE_SET); if(safe_str_neq(digest_verify, digest)) { crm_err("Digest mis-match on replace from %s: %s vs. 
%s (expected)", peer, digest_verify, digest); reason = "digest mismatch"; } else { crm_info("Digest matched on replace from %s: %s", peer, digest); } free(digest_verify); } else { crm_trace("No digest to verify"); } cib_version_details(existing_cib, &admin_epoch, &epoch, &updates); cib_version_details(input, &replace_admin_epoch, &replace_epoch, &replace_updates); if (replace_admin_epoch < admin_epoch) { reason = XML_ATTR_GENERATION_ADMIN; } else if (replace_admin_epoch > admin_epoch) { /* no more checks */ } else if (replace_epoch < epoch) { reason = XML_ATTR_GENERATION; } else if (replace_epoch > epoch) { /* no more checks */ } else if (replace_updates < updates) { reason = XML_ATTR_NUMUPDATES; } if (reason != NULL) { crm_warn("Replacement %d.%d.%d from %s not applied to %d.%d.%d:" " current %s is greater than the replacement", replace_admin_epoch, replace_epoch, replace_updates, peer, admin_epoch, epoch, updates, reason); result = -pcmk_err_old_data; } else { crm_info("Replaced %d.%d.%d with %d.%d.%d from %s", admin_epoch, epoch, updates, replace_admin_epoch, replace_epoch, replace_updates, peer); } free_xml(*result_cib); *result_cib = copy_xml(input); } else { xmlNode *obj_root = NULL; gboolean ok = TRUE; obj_root = get_object_root(section, *result_cib); ok = replace_xml_child(NULL, obj_root, input, FALSE); if (ok == FALSE) { crm_trace("No matching object to replace"); result = -ENXIO; } } return result; } int cib_process_delete(const char *op, int options, const char *section, xmlNode * req, xmlNode * input, xmlNode * existing_cib, xmlNode ** result_cib, xmlNode ** answer) { xmlNode *obj_root = NULL; crm_trace("Processing \"%s\" event", op); if (options & cib_xpath) { return cib_process_xpath(op, options, section, req, input, existing_cib, result_cib, answer); } if (input == NULL) { crm_err("Cannot perform modification with no data"); return -EINVAL; } obj_root = get_object_root(section, *result_cib); if (replace_xml_child(NULL, obj_root, input, TRUE) == FALSE) { crm_trace("No matching object to delete"); } return pcmk_ok; } int cib_process_modify(const char *op, int options, const char *section, xmlNode * req, xmlNode * input, xmlNode * existing_cib, xmlNode ** result_cib, xmlNode ** answer) { xmlNode *obj_root = NULL; crm_trace("Processing \"%s\" event", op); if (options & cib_xpath) { return cib_process_xpath(op, options, section, req, input, existing_cib, result_cib, answer); } if (input == NULL) { crm_err("Cannot perform modification with no data"); return -EINVAL; } obj_root = get_object_root(section, *result_cib); if (obj_root == NULL) { xmlNode *tmp_section = NULL; const char *path = get_object_parent(section); if (path == NULL) { return -EINVAL; } tmp_section = create_xml_node(NULL, section); cib_process_xpath(CIB_OP_CREATE, 0, path, NULL, tmp_section, NULL, result_cib, answer); free_xml(tmp_section); obj_root = get_object_root(section, *result_cib); } CRM_CHECK(obj_root != NULL, return -EINVAL); if (update_xml_child(obj_root, input) == FALSE) { if (options & cib_can_create) { add_node_copy(obj_root, input); } else { return -ENXIO; } } return pcmk_ok; } static int update_cib_object(xmlNode * parent, xmlNode * update) { int result = pcmk_ok; xmlNode *target = NULL; xmlNode *a_child = NULL; const char *replace = NULL; const char *object_id = NULL; const char *object_name = NULL; CRM_CHECK(update != NULL, return -EINVAL); CRM_CHECK(parent != NULL, return -EINVAL); object_name = crm_element_name(update); CRM_CHECK(object_name != NULL, return -EINVAL); object_id = ID(update); 
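    /* An update without an id acts as a placeholder: it matches (or creates)
     * the first child with the same tag name, instead of requiring a
     * name+id match via find_entity().
     */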
crm_trace("Processing: <%s id=%s>", crm_str(object_name), crm_str(object_id)); if (object_id == NULL) { /* placeholder object */ target = find_xml_node(parent, object_name, FALSE); } else { target = find_entity(parent, object_name, object_id); } if (target == NULL) { target = create_xml_node(parent, object_name); } crm_trace("Found node <%s id=%s> to update", crm_str(object_name), crm_str(object_id)); replace = crm_element_value(update, XML_CIB_ATTR_REPLACE); if (replace != NULL) { xmlNode *remove = NULL; int last = 0, lpc = 0, len = 0; len = strlen(replace); while (lpc <= len) { if (replace[lpc] == ',' || replace[lpc] == 0) { char *replace_item = NULL; if (last == lpc) { /* nothing to do */ last = lpc + 1; goto incr; } replace_item = calloc(1, lpc - last + 1); memcpy(replace_item, replace + last, lpc - last); remove = find_xml_node(target, replace_item, FALSE); if (remove != NULL) { crm_trace("Replacing node <%s> in <%s>", replace_item, crm_element_name(target)); free_xml(remove); remove = NULL; } free(replace_item); last = lpc + 1; } incr: lpc++; } xml_remove_prop(update, XML_CIB_ATTR_REPLACE); xml_remove_prop(target, XML_CIB_ATTR_REPLACE); } copy_in_properties(target, update); crm_trace("Processing children of <%s id=%s>", crm_str(object_name), crm_str(object_id)); for (a_child = __xml_first_child(update); a_child != NULL; a_child = __xml_next(a_child)) { int tmp_result = 0; crm_trace("Updating child <%s id=%s>", crm_element_name(a_child), ID(a_child)); tmp_result = update_cib_object(target, a_child); /* only the first error is likely to be interesting */ if (tmp_result != pcmk_ok) { crm_err("Error updating child <%s id=%s>", crm_element_name(a_child), ID(a_child)); if (result == pcmk_ok) { result = tmp_result; } } } crm_trace("Finished with <%s id=%s>", crm_str(object_name), crm_str(object_id)); return result; } static int add_cib_object(xmlNode * parent, xmlNode * new_obj) { int result = pcmk_ok; const char *object_name = NULL; const char *object_id = NULL; xmlNode *equiv_node = NULL; if (new_obj != NULL) { object_name = crm_element_name(new_obj); } object_id = crm_element_value(new_obj, XML_ATTR_ID); crm_trace("Processing: <%s id=%s>", crm_str(object_name), crm_str(object_id)); if (new_obj == NULL || object_name == NULL) { result = -EINVAL; } else if (parent == NULL) { result = -EINVAL; } else if (object_id == NULL) { /* placeholder object */ equiv_node = find_xml_node(parent, object_name, FALSE); } else { equiv_node = find_entity(parent, object_name, object_id); } if (result != pcmk_ok) { ; /* do nothing */ } else if (equiv_node != NULL) { result = -ENOTUNIQ; } else { result = update_cib_object(parent, new_obj); } return result; } int cib_process_create(const char *op, int options, const char *section, xmlNode * req, xmlNode * input, xmlNode * existing_cib, xmlNode ** result_cib, xmlNode ** answer) { xmlNode *failed = NULL; int result = pcmk_ok; xmlNode *update_section = NULL; crm_trace("Processing \"%s\" event for section=%s", op, crm_str(section)); if (safe_str_eq(XML_CIB_TAG_SECTION_ALL, section)) { section = NULL; } else if (safe_str_eq(XML_TAG_CIB, section)) { section = NULL; } else if (safe_str_eq(crm_element_name(input), XML_TAG_CIB)) { section = NULL; } CRM_CHECK(strcasecmp(CIB_OP_CREATE, op) == 0, return -EINVAL); if (input == NULL) { crm_err("Cannot perform modification with no data"); return -EINVAL; } if (section == NULL) { return cib_process_modify(op, options, section, req, input, existing_cib, result_cib, answer); } failed = create_xml_node(NULL, XML_TAG_FAILED); 
update_section = get_object_root(section, *result_cib); if (safe_str_eq(crm_element_name(input), section)) { xmlNode *a_child = NULL; for (a_child = __xml_first_child(input); a_child != NULL; a_child = __xml_next(a_child)) { result = add_cib_object(update_section, a_child); if (update_results(failed, a_child, op, result)) { break; } } } else { result = add_cib_object(update_section, input); update_results(failed, input, op, result); } if (xml_has_children(failed)) { CRM_CHECK(result != pcmk_ok, result = -EINVAL); } if (result != pcmk_ok) { crm_log_xml_err(failed, "CIB Update failures"); *answer = failed; } else { free_xml(failed); } return result; } int cib_process_diff(const char *op, int options, const char *section, xmlNode * req, xmlNode * input, xmlNode * existing_cib, xmlNode ** result_cib, xmlNode ** answer) { unsigned int log_level = LOG_DEBUG; const char *reason = NULL; gboolean apply_diff = TRUE; int result = pcmk_ok; int this_updates = 0; int this_epoch = 0; int this_admin_epoch = 0; int diff_add_updates = 0; int diff_add_epoch = 0; int diff_add_admin_epoch = 0; int diff_del_updates = 0; int diff_del_epoch = 0; int diff_del_admin_epoch = 0; - const char *originator = crm_element_value(req, F_ORIG); + const char *originator = NULL; + + if (req != NULL) { + originator = crm_element_value(req, F_ORIG); + } crm_trace("Processing \"%s\" event", op); cib_diff_version_details(input, &diff_add_admin_epoch, &diff_add_epoch, &diff_add_updates, &diff_del_admin_epoch, &diff_del_epoch, &diff_del_updates); crm_element_value_int(existing_cib, XML_ATTR_GENERATION, &this_epoch); crm_element_value_int(existing_cib, XML_ATTR_NUMUPDATES, &this_updates); crm_element_value_int(existing_cib, XML_ATTR_GENERATION_ADMIN, &this_admin_epoch); if (this_epoch < 0) { this_epoch = 0; } if (this_updates < 0) { this_updates = 0; } if (this_admin_epoch < 0) { this_admin_epoch = 0; } if (diff_del_admin_epoch == diff_add_admin_epoch && diff_del_epoch == diff_add_epoch && diff_del_updates == diff_add_updates) { if (options & cib_force_diff) { apply_diff = FALSE; log_level = LOG_ERR; reason = "+ and - versions in the diff did not change in global update"; crm_log_xml_warn(input, "Bad global update"); } else if (diff_add_admin_epoch == -1 && diff_add_epoch == -1 && diff_add_updates == -1) { diff_add_epoch = this_epoch; diff_add_updates = this_updates + 1; diff_add_admin_epoch = this_admin_epoch; diff_del_epoch = this_epoch; diff_del_updates = this_updates; diff_del_admin_epoch = this_admin_epoch; } else { apply_diff = FALSE; log_level = LOG_ERR; reason = "+ and - versions in the diff did not change"; log_cib_diff(LOG_ERR, input, __FUNCTION__); } } if (apply_diff && diff_del_admin_epoch > this_admin_epoch) { result = -pcmk_err_diff_resync; apply_diff = FALSE; log_level = LOG_INFO; reason = "current \"" XML_ATTR_GENERATION_ADMIN "\" is less than required"; } else if (apply_diff && diff_del_admin_epoch < this_admin_epoch) { apply_diff = FALSE; log_level = LOG_WARNING; reason = "current \"" XML_ATTR_GENERATION_ADMIN "\" is greater than required"; } else if (apply_diff && diff_del_epoch > this_epoch) { result = -pcmk_err_diff_resync; apply_diff = FALSE; log_level = LOG_INFO; reason = "current \"" XML_ATTR_GENERATION "\" is less than required"; } else if (apply_diff && diff_del_epoch < this_epoch) { apply_diff = FALSE; log_level = LOG_WARNING; reason = "current \"" XML_ATTR_GENERATION "\" is greater than required"; } else if (apply_diff && diff_del_updates > this_updates) { result = -pcmk_err_diff_resync; apply_diff = 
FALSE; log_level = LOG_INFO; reason = "current \"" XML_ATTR_NUMUPDATES "\" is less than required"; } else if (apply_diff && diff_del_updates < this_updates) { apply_diff = FALSE; log_level = LOG_WARNING; reason = "current \"" XML_ATTR_NUMUPDATES "\" is greater than required"; } if (apply_diff) { free_xml(*result_cib); *result_cib = NULL; if (apply_xml_diff(existing_cib, input, result_cib) == FALSE) { log_level = LOG_NOTICE; reason = "Failed application of an update diff"; if (options & cib_force_diff) { result = -pcmk_err_diff_resync; } } } if (reason != NULL) { do_crm_log(log_level, "Diff %d.%d.%d -> %d.%d.%d from %s not applied to %d.%d.%d: %s", diff_del_admin_epoch, diff_del_epoch, diff_del_updates, diff_add_admin_epoch, diff_add_epoch, diff_add_updates, originator?originator:"local", this_admin_epoch, this_epoch, this_updates, reason); crm_log_xml_trace(input, "Discarded diff"); if (result == pcmk_ok) { result = -pcmk_err_diff_failed; } } else if (apply_diff) { crm_trace("Diff %d.%d.%d -> %d.%d.%d from %s was applied to %d.%d.%d", diff_del_admin_epoch, diff_del_epoch, diff_del_updates, diff_add_admin_epoch, diff_add_epoch, diff_add_updates, originator?originator:"local", this_admin_epoch, this_epoch, this_updates); } return result; } gboolean apply_cib_diff(xmlNode * old, xmlNode * diff, xmlNode ** new) { gboolean result = TRUE; const char *value = NULL; int this_updates = 0; int this_epoch = 0; int this_admin_epoch = 0; int diff_add_updates = 0; int diff_add_epoch = 0; int diff_add_admin_epoch = 0; int diff_del_updates = 0; int diff_del_epoch = 0; int diff_del_admin_epoch = 0; CRM_CHECK(diff != NULL, return FALSE); CRM_CHECK(old != NULL, return FALSE); value = crm_element_value(old, XML_ATTR_GENERATION_ADMIN); this_admin_epoch = crm_parse_int(value, "0"); crm_trace("%s=%d (%s)", XML_ATTR_GENERATION_ADMIN, this_admin_epoch, value); value = crm_element_value(old, XML_ATTR_GENERATION); this_epoch = crm_parse_int(value, "0"); crm_trace("%s=%d (%s)", XML_ATTR_GENERATION, this_epoch, value); value = crm_element_value(old, XML_ATTR_NUMUPDATES); this_updates = crm_parse_int(value, "0"); crm_trace("%s=%d (%s)", XML_ATTR_NUMUPDATES, this_updates, value); cib_diff_version_details(diff, &diff_add_admin_epoch, &diff_add_epoch, &diff_add_updates, &diff_del_admin_epoch, &diff_del_epoch, &diff_del_updates); value = NULL; if (result && diff_del_admin_epoch != this_admin_epoch) { value = XML_ATTR_GENERATION_ADMIN; result = FALSE; crm_trace("%s=%d", value, diff_del_admin_epoch); } else if (result && diff_del_epoch != this_epoch) { value = XML_ATTR_GENERATION; result = FALSE; crm_trace("%s=%d", value, diff_del_epoch); } else if (result && diff_del_updates != this_updates) { value = XML_ATTR_NUMUPDATES; result = FALSE; crm_trace("%s=%d", value, diff_del_updates); } if (result) { xmlNode *tmp = NULL; xmlNode *diff_copy = copy_xml(diff); tmp = find_xml_node(diff_copy, "diff-removed", TRUE); if (tmp != NULL) { xml_remove_prop(tmp, XML_ATTR_GENERATION_ADMIN); xml_remove_prop(tmp, XML_ATTR_GENERATION); xml_remove_prop(tmp, XML_ATTR_NUMUPDATES); } tmp = find_xml_node(diff_copy, "diff-added", TRUE); if (tmp != NULL) { xml_remove_prop(tmp, XML_ATTR_GENERATION_ADMIN); xml_remove_prop(tmp, XML_ATTR_GENERATION); xml_remove_prop(tmp, XML_ATTR_NUMUPDATES); } result = apply_xml_diff(old, diff_copy, new); free_xml(diff_copy); } else { crm_err("target and diff %s values didnt match", value); } return result; } gboolean cib_config_changed(xmlNode * last, xmlNode * next, xmlNode ** diff) { gboolean config_changes = FALSE; 
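    /* Approach: diff the two CIB snapshots, then use XPath on the diff to
     * look for anything under the configuration section, or for changed
     * version/validation attributes, either of which makes this a
     * configuration change rather than a pure status update.
     */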
xmlXPathObject *xpathObj = NULL; CRM_ASSERT(diff != NULL); if (last != NULL && next != NULL) { *diff = diff_xml_object(last, next, FALSE); } if (*diff == NULL) { goto done; } xpathObj = xpath_search(*diff, "//" XML_CIB_TAG_CONFIGURATION); if (xpathObj && xpathObj->nodesetval->nodeNr > 0) { config_changes = TRUE; goto done; } else if (xpathObj) { xmlXPathFreeObject(xpathObj); } /* * Do not check XML_TAG_DIFF_ADDED "//" XML_TAG_CIB * This always contains every field and would produce a false positive * every time if the checked value existed */ xpathObj = xpath_search(*diff, "//" XML_TAG_DIFF_REMOVED "//" XML_TAG_CIB); if (xpathObj) { int lpc = 0, max = xpathObj->nodesetval->nodeNr; for (lpc = 0; lpc < max; lpc++) { xmlNode *top = getXpathResult(xpathObj, lpc); if (crm_element_value(top, XML_ATTR_GENERATION) != NULL) { config_changes = TRUE; goto done; } if (crm_element_value(top, XML_ATTR_GENERATION_ADMIN) != NULL) { config_changes = TRUE; goto done; } if (crm_element_value(top, XML_ATTR_VALIDATION) != NULL) { config_changes = TRUE; goto done; } if (crm_element_value(top, XML_ATTR_CRM_VERSION) != NULL) { config_changes = TRUE; goto done; } if (crm_element_value(top, "remote-clear-port") != NULL) { config_changes = TRUE; goto done; } if (crm_element_value(top, "remote-tls-port") != NULL) { config_changes = TRUE; goto done; } } } done: if (xpathObj) { xmlXPathFreeObject(xpathObj); } return config_changes; } xmlNode * diff_cib_object(xmlNode * old_cib, xmlNode * new_cib, gboolean suppress) { char *digest = NULL; xmlNode *diff = NULL; const char *version = crm_element_value(new_cib, XML_ATTR_CRM_VERSION); gboolean changed = cib_config_changed(old_cib, new_cib, &diff); fix_cib_diff(old_cib, new_cib, diff, changed); digest = calculate_xml_versioned_digest(new_cib, FALSE, TRUE, version); crm_xml_add(diff, XML_ATTR_DIGEST, digest); free(digest); return diff; } int cib_process_xpath(const char *op, int options, const char *section, xmlNode * req, xmlNode * input, xmlNode * existing_cib, xmlNode ** result_cib, xmlNode ** answer) { int lpc = 0; int max = 0; int rc = pcmk_ok; gboolean is_query = safe_str_eq(op, CIB_OP_QUERY); xmlXPathObjectPtr xpathObj = NULL; crm_trace("Processing \"%s\" event", op); if (is_query) { xpathObj = xpath_search(existing_cib, section); } else { xpathObj = xpath_search(*result_cib, section); } if (xpathObj != NULL && xpathObj->nodesetval != NULL) { max = xpathObj->nodesetval->nodeNr; } if (max < 1 && safe_str_eq(op, CIB_OP_DELETE)) { crm_debug("%s was already removed", section); } else if (max < 1) { crm_debug("%s: %s does not exist", op, section); rc = -ENXIO; } else if (is_query) { if (max > 1) { *answer = create_xml_node(NULL, "xpath-query"); } } for (lpc = 0; lpc < max; lpc++) { xmlChar *path = NULL; xmlNode *match = getXpathResult(xpathObj, lpc); if (match == NULL) { continue; } path = xmlGetNodePath(match); crm_debug("Processing %s op for %s (%s)", op, section, path); free(path); if (safe_str_eq(op, CIB_OP_DELETE)) { free_xml(match); if ((options & cib_multiple) == 0) { break; } } else if (safe_str_eq(op, CIB_OP_MODIFY)) { if (update_xml_child(match, input) == FALSE) { rc = -ENXIO; } else if ((options & cib_multiple) == 0) { break; } } else if (safe_str_eq(op, CIB_OP_CREATE)) { add_node_copy(match, input); break; } else if (safe_str_eq(op, CIB_OP_QUERY)) { if (options & cib_no_children) { const char *tag = TYPE(match); xmlNode *shallow = create_xml_node(*answer, tag); copy_in_properties(shallow, match); if (*answer == NULL) { *answer = shallow; } } else if (*answer) { 
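            /* Subsequent matches are appended (as copies) under the aggregate
             * "xpath-query" node created above, so a multi-match query
             * returns a single document. */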
add_node_copy(*answer, match); } else { *answer = match; } } else if (safe_str_eq(op, CIB_OP_REPLACE)) { xmlNode *parent = match->parent; free_xml(match); if (input != NULL) { add_node_copy(parent, input); } if ((options & cib_multiple) == 0) { break; } } } if (xpathObj) { xmlXPathFreeObject(xpathObj); } return rc; } /* remove this function */ gboolean update_results(xmlNode * failed, xmlNode * target, const char *operation, int return_code) { xmlNode *xml_node = NULL; gboolean was_error = FALSE; const char *error_msg = NULL; if (return_code != pcmk_ok) { error_msg = pcmk_strerror(return_code); was_error = TRUE; xml_node = create_xml_node(failed, XML_FAIL_TAG_CIB); add_node_copy(xml_node, target); crm_xml_add(xml_node, XML_FAILCIB_ATTR_ID, ID(target)); crm_xml_add(xml_node, XML_FAILCIB_ATTR_OBJTYPE, TYPE(target)); crm_xml_add(xml_node, XML_FAILCIB_ATTR_OP, operation); crm_xml_add(xml_node, XML_FAILCIB_ATTR_REASON, error_msg); crm_warn("Action %s failed: %s (cde=%d)", operation, error_msg, return_code); } return was_error; } diff --git a/lib/common/ipc.c b/lib/common/ipc.c index d5d4957dfd..b6558abad0 100644 --- a/lib/common/ipc.c +++ b/lib/common/ipc.c @@ -1,679 +1,692 @@ /* * Copyright (C) 2004 Andrew Beekhof * * This library is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ #include #include #include #include #include #include #include #include #include #include #include struct crm_ipc_request_header { struct qb_ipc_request_header qb; uint32_t flags; }; static char * generateReference(const char *custom1, const char *custom2) { static uint ref_counter = 0; const char *local_cust1 = custom1; const char *local_cust2 = custom2; int reference_len = 4; char *since_epoch = NULL; reference_len += 20; /* too big */ reference_len += 40; /* too big */ if (local_cust1 == NULL) { local_cust1 = "_empty_"; } reference_len += strlen(local_cust1); if (local_cust2 == NULL) { local_cust2 = "_empty_"; } reference_len += strlen(local_cust2); since_epoch = calloc(1, reference_len); if (since_epoch != NULL) { sprintf(since_epoch, "%s-%s-%ld-%u", local_cust1, local_cust2, (unsigned long)time(NULL), ref_counter++); } return since_epoch; } xmlNode * create_request_adv(const char *task, xmlNode * msg_data, const char *host_to, const char *sys_to, const char *sys_from, const char *uuid_from, const char *origin) { char *true_from = NULL; xmlNode *request = NULL; char *reference = generateReference(task, sys_from); if (uuid_from != NULL) { true_from = generate_hash_key(sys_from, uuid_from); } else if (sys_from != NULL) { true_from = strdup(sys_from); } else { crm_err("No sys from specified"); } /* host_from will get set for us if necessary by CRMd when routed */ request = create_xml_node(NULL, __FUNCTION__); crm_xml_add(request, F_CRM_ORIGIN, origin); crm_xml_add(request, F_TYPE, T_CRM); crm_xml_add(request, F_CRM_VERSION, CRM_FEATURE_SET); crm_xml_add(request, 
F_CRM_MSG_TYPE, XML_ATTR_REQUEST); crm_xml_add(request, XML_ATTR_REFERENCE, reference); crm_xml_add(request, F_CRM_TASK, task); crm_xml_add(request, F_CRM_SYS_TO, sys_to); crm_xml_add(request, F_CRM_SYS_FROM, true_from); /* HOSTTO will be ignored if it is to the DC anyway. */ if (host_to != NULL && strlen(host_to) > 0) { crm_xml_add(request, F_CRM_HOST_TO, host_to); } if (msg_data != NULL) { add_message_xml(request, F_CRM_DATA, msg_data); } free(reference); free(true_from); return request; } /* * This method adds a copy of xml_response_data */ xmlNode * create_reply_adv(xmlNode * original_request, xmlNode * xml_response_data, const char *origin) { xmlNode *reply = NULL; const char *host_from = crm_element_value(original_request, F_CRM_HOST_FROM); const char *sys_from = crm_element_value(original_request, F_CRM_SYS_FROM); const char *sys_to = crm_element_value(original_request, F_CRM_SYS_TO); const char *type = crm_element_value(original_request, F_CRM_MSG_TYPE); const char *operation = crm_element_value(original_request, F_CRM_TASK); const char *crm_msg_reference = crm_element_value(original_request, XML_ATTR_REFERENCE); if (type == NULL) { crm_err("Cannot create new_message," " no message type in original message"); CRM_ASSERT(type != NULL); return NULL; #if 0 } else if (strcasecmp(XML_ATTR_REQUEST, type) != 0) { crm_err("Cannot create new_message," " original message was not a request"); return NULL; #endif } reply = create_xml_node(NULL, __FUNCTION__); crm_xml_add(reply, F_CRM_ORIGIN, origin); crm_xml_add(reply, F_TYPE, T_CRM); crm_xml_add(reply, F_CRM_VERSION, CRM_FEATURE_SET); crm_xml_add(reply, F_CRM_MSG_TYPE, XML_ATTR_RESPONSE); crm_xml_add(reply, XML_ATTR_REFERENCE, crm_msg_reference); crm_xml_add(reply, F_CRM_TASK, operation); /* since this is a reply, we reverse the from and to */ crm_xml_add(reply, F_CRM_SYS_TO, sys_from); crm_xml_add(reply, F_CRM_SYS_FROM, sys_to); /* HOSTTO will be ignored if it is to the DC anyway. */ if (host_from != NULL && strlen(host_from) > 0) { crm_xml_add(reply, F_CRM_HOST_TO, host_from); } if (xml_response_data != NULL) { add_message_xml(reply, F_CRM_DATA, xml_response_data); } return reply; } /* Libqb based IPC */ /* Server... 
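 *
 * crm_ipcs_send() retries transmissions that fail with -EAGAIN, sleeping
 * 250ms between attempts; the retry budget scales with message severity
 * (40 attempts in the default/critical case, 20 for errors, 10 for
 * informational events).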
*/ int crm_ipcs_client_pid(qb_ipcs_connection_t *c) { struct qb_ipcs_connection_stats stats; stats.client_pid = 0; qb_ipcs_connection_stats_get(c, &stats, 0); return stats.client_pid; } xmlNode * crm_ipcs_recv(qb_ipcs_connection_t *c, void *data, size_t size, uint32_t *id, uint32_t *flags) { char *text = ((char*)data) + sizeof(struct crm_ipc_request_header); crm_trace("Received %.200s", text); if(id) { *id = ((struct qb_ipc_request_header*)data)->id; } if(flags) { *flags = ((struct crm_ipc_request_header*)data)->flags; } return string2xml(text); } ssize_t crm_ipcs_send(qb_ipcs_connection_t *c, uint32_t request, xmlNode *message, enum crm_ipc_server_flags flags) { int rc; int lpc = 0; int retries = 40; int level = LOG_CRIT; struct iovec iov[2]; static uint32_t id = 1; const char *type = "Response"; struct qb_ipc_response_header header; char *buffer = dump_xml_unformatted(message); struct timespec delay = { 0, 250000000 }; /* 250ms */ memset(&iov, 0, 2 * sizeof(struct iovec)); iov[0].iov_len = sizeof(struct qb_ipc_response_header); iov[0].iov_base = &header; iov[1].iov_len = 1 + strlen(buffer); iov[1].iov_base = buffer; if(flags & crm_ipc_server_event) { header.id = id++; /* We don't really use it, but doesn't hurt to set one */ } else { CRM_LOG_ASSERT (request != 0); header.id = request; /* Replying to a specific request */ } header.error = 0; /* unused */ header.size = iov[0].iov_len + iov[1].iov_len; if(flags & crm_ipc_server_error) { retries = 20; level = LOG_ERR; } else if(flags & crm_ipc_server_info) { retries = 10; level = LOG_INFO; } while(lpc < retries) { if(flags & crm_ipc_server_event) { type = "Event"; rc = qb_ipcs_event_sendv(c, iov, 2); if(rc == -EPIPE || rc == -ENOTCONN) { crm_trace("Client %p disconnected", c); level = LOG_INFO; } } else { rc = qb_ipcs_response_sendv(c, iov, 2); } if(rc != -EAGAIN) { break; } lpc++; crm_debug("Attempting resend %d of %s %d (%d bytes) to %p[%d]: %.120s", lpc, type, header.id, header.size, c, crm_ipcs_client_pid(c), buffer); nanosleep(&delay, NULL); } if(rc < header.size) { struct qb_ipcs_connection_stats_2 *stats = qb_ipcs_connection_stats_get_2(c, 0); do_crm_log(level, "%s %d failed, size=%d, to=%p[%d], queue=%d, rc=%d: %.120s", type, header.id, header.size, c, stats->client_pid, stats->event_q_length, rc, buffer); free(stats); } else { crm_trace("%s %d sent, %d bytes to %p[%d]: %.120s", type, header.id, rc, c, crm_ipcs_client_pid(c), buffer); } free(buffer); return rc; } void crm_ipcs_send_ack( qb_ipcs_connection_t *c, uint32_t request, const char *tag, const char *function, int line) { xmlNode *ack = create_xml_node(NULL, tag); crm_xml_add(ack, "function", function); crm_xml_add_int(ack, "line", line); crm_ipcs_send(c, request, ack, FALSE); free_xml(ack); } /* Client... 
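 *
 * Each connection allocates a receive buffer of buf_size bytes; the size
 * defaults to MAX_MSG_SIZE but can be overridden via the PCMK_ipc_buffer
 * environment variable, with values below MIN_MSG_SIZE rounded up (see
 * pick_ipc_buffer()).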
*/ #define MIN_MSG_SIZE 12336 /* sizeof(struct qb_ipc_connection_response) */ #define MAX_MSG_SIZE 20*1024 struct crm_ipc_s { struct pollfd pfd; int buf_size; int msg_size; int need_reply; char *buffer; char *name; qb_ipcc_connection_t *ipc; }; static int pick_ipc_buffer(int max) { const char *env = getenv("PCMK_ipc_buffer"); if(env) { max = crm_parse_int(env, "0"); } if(max <= 0) { max = MAX_MSG_SIZE; } if(max < MIN_MSG_SIZE) { max = MIN_MSG_SIZE; } crm_trace("Using max message size of %d", max); return max; } crm_ipc_t * crm_ipc_new(const char *name, size_t max_size) { crm_ipc_t *client = NULL; client = calloc(1, sizeof(crm_ipc_t)); client->name = strdup(name); client->buf_size = pick_ipc_buffer(max_size); client->buffer = malloc(client->buf_size); client->pfd.fd = -1; client->pfd.events = POLLIN; client->pfd.revents = 0; return client; } bool crm_ipc_connect(crm_ipc_t *client) { client->need_reply = FALSE; client->ipc = qb_ipcc_connect(client->name, client->buf_size); if (client->ipc == NULL) { crm_perror(LOG_INFO, "Could not establish %s connection", client->name); return FALSE; } client->pfd.fd = crm_ipc_get_fd(client); if(client->pfd.fd < 0) { crm_perror(LOG_INFO, "Could not obtain file descriptor for %s connection", client->name); return FALSE; } qb_ipcc_context_set(client->ipc, client); return TRUE; } void crm_ipc_close(crm_ipc_t *client) { if(client) { crm_trace("Disconnecting %s IPC connection %p (%p)", client->name, client, client->ipc); if(client->ipc) { qb_ipcc_connection_t *ipc = client->ipc; client->ipc = NULL; qb_ipcc_disconnect(ipc); } } } void crm_ipc_destroy(crm_ipc_t *client) { if(client) { if(client->ipc && qb_ipcc_is_connected(client->ipc)) { crm_notice("Destroying an active IPC connection to %s", client->name); /* The next line is basically unsafe * * If this connection was attached to mainloop and mainloop is active, * the 'disconnected' callback will end up back here and we'll end * up free'ing the memory twice - something that can still happen * even without this if we destroy a connection and it closes before * we call exit */ /* crm_ipc_close(client); */ } crm_trace("Destroying IPC connection to %s: %p", client->name, client); free(client->buffer); free(client->name); free(client); } } int crm_ipc_get_fd(crm_ipc_t *client) { int fd = 0; CRM_ASSERT(client != NULL); if(client->ipc && qb_ipcc_fd_get(client->ipc, &fd) == 0) { return fd; } crm_perror(LOG_ERR, "Could not obtain IPC file descriptor for %s", client->name); return -EINVAL; } bool crm_ipc_connected(crm_ipc_t *client) { bool rc = FALSE; if(client == NULL) { crm_trace("No client"); return FALSE; } else if(client->ipc == NULL) { crm_trace("No connection"); return FALSE; } else if(client->pfd.fd < 0) { crm_trace("Bad descriptor"); return FALSE; } rc = qb_ipcc_is_connected(client->ipc); if(rc == FALSE) { client->pfd.fd = -EINVAL; } return rc; } int crm_ipc_ready(crm_ipc_t *client) { CRM_ASSERT(client != NULL); if(crm_ipc_connected(client) == FALSE) { return -ENOTCONN; } client->pfd.revents = 0; return poll(&(client->pfd), 1, 0); } long crm_ipc_read(crm_ipc_t *client) { CRM_ASSERT(client != NULL); CRM_ASSERT(client->ipc != NULL); CRM_ASSERT(client->buffer != NULL); client->buffer[0] = 0; client->msg_size = qb_ipcc_event_recv(client->ipc, client->buffer, client->buf_size-1, 0); if(client->msg_size >= 0) { struct qb_ipc_response_header *header = (struct qb_ipc_response_header *)client->buffer; client->buffer[client->msg_size] = 0; crm_trace("Received %s event %d, size=%d, rc=%d, text: %.200s", client->name,
header->id, header->size, client->msg_size, client->buffer+sizeof(struct qb_ipc_response_header)); } else { crm_trace("No message from %s received: %s", client->name, pcmk_strerror(client->msg_size)); } if(crm_ipc_connected(client) == FALSE || client->msg_size == -ENOTCONN) { crm_err("Connection to %s failed", client->name); } return client->msg_size; } const char * crm_ipc_buffer(crm_ipc_t *client) { CRM_ASSERT(client != NULL); return client->buffer + sizeof(struct qb_ipc_response_header); } const char *crm_ipc_name(crm_ipc_t *client) { CRM_ASSERT(client != NULL); return client->name; } static int internal_ipc_send_recv(crm_ipc_t *client, const void *iov) { int rc = 0; do { rc = qb_ipcc_sendv_recv(client->ipc, iov, 2, client->buffer, client->buf_size, -1); } while(rc == -EAGAIN && crm_ipc_connected(client)); return rc; } static int internal_ipc_send_request(crm_ipc_t *client, const void *iov, int ms_timeout) { int rc = 0; time_t timeout = time(NULL) + 1 + (ms_timeout / 1000); do { rc = qb_ipcc_sendv(client->ipc, iov, 2); } while(rc == -EAGAIN && time(NULL) < timeout && crm_ipc_connected(client)); return rc; } static int internal_ipc_get_reply(crm_ipc_t *client, int request_id, int ms_timeout) { time_t timeout = time(NULL) + 1 + (ms_timeout / 1000); + struct timespec waitsleep = { + .tv_sec = 0, + .tv_nsec = 500000000 + }; int rc = 0; /* get the reply */ crm_trace("client %s waiting on reply to msg id %d", client->name, request_id); do { + +#if HAVE_QB_IPCC_READY + qb_ipcc_ready(client->ipc, ms_timeout); +#else + if(rc <= 0) { + nanosleep(&waitsleep, 0); + } +#endif + rc = qb_ipcc_recv(client->ipc, client->buffer, client->buf_size, 500); if(rc > 0) { struct qb_ipc_response_header *hdr = (struct qb_ipc_response_header *)client->buffer; if(hdr->id == request_id) { /* Got it */ break; } else if(hdr->id < request_id){ xmlNode *bad = string2xml(crm_ipc_buffer(client)); crm_err("Discarding old reply %d (need %d)", hdr->id, request_id); crm_log_xml_notice(bad, "OldIpcReply"); } else { xmlNode *bad = string2xml(crm_ipc_buffer(client)); crm_err("Discarding newer reply %d (need %d)", hdr->id, request_id); crm_log_xml_notice(bad, "ImpossibleReply"); CRM_ASSERT(hdr->id <= request_id); } } else if (crm_ipc_connected(client) == FALSE) { crm_err("Server disconnected client %s while waiting for msg id %d", client->name, request_id); break; } } while(time(NULL) < timeout); return rc; } int crm_ipc_send(crm_ipc_t *client, xmlNode *message, enum crm_ipc_flags flags, int32_t ms_timeout, xmlNode **reply) { long rc = 0; struct iovec iov[2]; static uint32_t id = 0; struct crm_ipc_request_header header; char *buffer = NULL; if(crm_ipc_connected(client) == FALSE) { /* Don't even bother */ crm_notice("Connection to %s closed", client->name); return -ENOTCONN; } if(client->need_reply) { crm_trace("Trying again to obtain pending reply from %s", client->name); rc = qb_ipcc_recv(client->ipc, client->buffer, client->buf_size, 300); if(rc < 0) { crm_warn("Sending to %s is disabled until pending reply is received", client->name); free(buffer); return -EREMOTEIO; } else { crm_notice("Lost reply from %s finally arrived; sending re-enabled", client->name); client->need_reply = FALSE; } } buffer = dump_xml_unformatted(message); iov[0].iov_len = sizeof(struct crm_ipc_request_header); iov[0].iov_base = &header; iov[1].iov_len = 1 + strlen(buffer); iov[1].iov_base = buffer; header.qb.id = ++id; header.qb.size = iov[0].iov_len + iov[1].iov_len; header.flags = flags; if(ms_timeout == 0) { ms_timeout = 5000; } crm_trace("Sending 
from client: %s request id: %d bytes: %u timeout:%d msg: %.200s...", client->name, header.qb.id, header.qb.size, ms_timeout, buffer); if(ms_timeout > 0) { rc = internal_ipc_send_request(client, iov, ms_timeout); if (rc <= 0) { crm_trace("Failed to send from client %s request %d with %u bytes: %.200s...", client->name, header.qb.id, header.qb.size, buffer); goto send_cleanup; } else if(is_not_set(flags, crm_ipc_client_response)) { crm_trace("Message sent, not waiting for reply to %d from %s (%u bytes): %.200s...", header.qb.id, client->name, header.qb.size, buffer); goto send_cleanup; } rc = internal_ipc_get_reply(client, header.qb.id, ms_timeout); if(rc < 0) { /* No reply, for now, disable sending * * The alternative is to close the connection since we don't know * how to detect and discard out-of-sequence replies * * TODO - implement the above */ client->need_reply = TRUE; } } else { rc = internal_ipc_send_recv(client, iov); } if(rc > 0) { struct qb_ipc_response_header *hdr = (struct qb_ipc_response_header *)client->buffer; crm_trace("Received response %d, size=%d, rc=%ld, text: %.200s", hdr->id, hdr->size, rc, crm_ipc_buffer(client)); if(reply) { *reply = string2xml(crm_ipc_buffer(client)); } } else { crm_trace("Response not received: rc=%ld, errno=%d", rc, errno); } send_cleanup: if(crm_ipc_connected(client) == FALSE) { crm_notice("Connection to %s closed: %s (%ld)", client->name, pcmk_strerror(rc), rc); } else if(rc <= 0) { crm_warn("Request %d to %s failed: %s (%ld)", header.qb.id, client->name, pcmk_strerror(rc), rc); crm_info("Request was %.120s", buffer); } free(buffer); return rc; } /* Utils */ xmlNode * create_hello_message(const char *uuid, const char *client_name, const char *major_version, const char *minor_version) { xmlNode *hello_node = NULL; xmlNode *hello = NULL; if (uuid == NULL || strlen(uuid) == 0 || client_name == NULL || strlen(client_name) == 0 || major_version == NULL || strlen(major_version) == 0 || minor_version == NULL || strlen(minor_version) == 0) { crm_err("Missing fields; Hello message will not be valid."); return NULL; } hello_node = create_xml_node(NULL, XML_TAG_OPTIONS); crm_xml_add(hello_node, "major_version", major_version); crm_xml_add(hello_node, "minor_version", minor_version); crm_xml_add(hello_node, "client_name", client_name); crm_xml_add(hello_node, "client_uuid", uuid); crm_trace("creating hello message"); hello = create_request(CRM_OP_HELLO, hello_node, NULL, NULL, client_name, uuid); free_xml(hello_node); return hello; } diff --git a/lib/fencing/st_client.c b/lib/fencing/st_client.c index a076eb45a1..e764fe3f1a 100644 --- a/lib/fencing/st_client.c +++ b/lib/fencing/st_client.c @@ -1,2276 +1,2360 @@ /* * Copyright (c) 2004 Andrew Beekhof * * This library is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. 
* * You should have received a copy of the GNU Lesser General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA * */ #include #include #include #include #include #include #include #include #include #include #include #include #include /* Add it for compiling on OSX */ #include #include #include #include #include #ifdef HAVE_STONITH_STONITH_H # include # define LHA_STONITH_LIBRARY "libstonith.so.1" static void *lha_agents_lib = NULL; #endif #include CRM_TRACE_INIT_DATA(stonith); struct stonith_action_s { + /*! user defined data */ char *agent; char *action; char *victim; char *args; int timeout; int async; void *userdata; void (*done_cb)(GPid pid, gint status, const char *output, gpointer user_data); - /* async track data */ + /*! internal async track data */ int fd_stdout; int last_timeout_signo; + + /*! internal timing information */ + time_t initial_start_time; + int tries; + int remaining_timeout; guint timer_sigterm; guint timer_sigkill; - /* output data */ + /* device output data */ GPid pid; int rc; char *output; }; typedef struct stonith_private_s { char *token; crm_ipc_t *ipc; mainloop_io_t *source; GHashTable *stonith_op_callback_table; GList *notify_list; void (*op_callback) (stonith_t * st, stonith_callback_data_t *data); } stonith_private_t; typedef struct stonith_notify_client_s { const char *event; const char *obj_id; /* implement one day */ const char *obj_type; /* implement one day */ void (*notify) (stonith_t * st, stonith_event_t *e); } stonith_notify_client_t; typedef struct stonith_callback_client_s { void (*callback) (stonith_t * st, stonith_callback_data_t *data); const char *id; void *user_data; gboolean only_success; gboolean allow_timeout_updates; struct timer_rec_s *timer; } stonith_callback_client_t; struct notify_blob_s { stonith_t *stonith; xmlNode *xml; }; struct timer_rec_s { int call_id; int timeout; guint ref; stonith_t *stonith; }; typedef int (*stonith_op_t) (const char *, int, const char *, xmlNode *, xmlNode *, xmlNode *, xmlNode **, xmlNode **); static const char META_TEMPLATE[] = "\n" "\n" "\n" " 1.0\n" " \n" "%s\n" " \n" " %s\n" "%s\n" " \n" " \n" " \n" " \n" " \n" " \n" " \n" " \n" " 2.0\n" " \n" "\n"; bool stonith_dispatch(stonith_t * st); int stonith_dispatch_internal(const char *buffer, ssize_t length, gpointer userdata); void stonith_perform_callback(stonith_t * stonith, xmlNode * msg, int call_id, int rc); xmlNode *stonith_create_op(int call_id, const char *token, const char *op, xmlNode * data, int call_options); int stonith_send_command(stonith_t * stonith, const char *op, xmlNode * data, xmlNode ** output_data, int call_options, int timeout); static void stonith_connection_destroy(gpointer user_data); static void stonith_send_notification(gpointer data, gpointer user_data); +static int internal_stonith_action_execute(stonith_action_t *action); static void stonith_connection_destroy(gpointer user_data) { stonith_t *stonith = user_data; stonith_private_t *native = NULL; struct notify_blob_s blob; crm_trace("Sending destroyed notification"); blob.stonith = stonith; blob.xml = create_xml_node(NULL, "notify"); native = stonith->private; native->ipc = NULL; native->source = NULL; stonith->state = stonith_disconnected; crm_xml_add(blob.xml, F_TYPE, T_STONITH_NOTIFY); crm_xml_add(blob.xml, F_SUBTYPE, T_STONITH_NOTIFY_DISCONNECT); g_list_foreach(native->notify_list, stonith_send_notification, &blob); free_xml(blob.xml); } xmlNode * 
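/* Build the registration fragment sent with STONITH_OP_DEVICE_ADD: the device id, agent and namespace as attributes, plus the agent parameters as a nested attribute set */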
create_device_registration_xml(const char *id, const char *namespace, const char *agent, stonith_key_value_t * params) { xmlNode *data = create_xml_node(NULL, F_STONITH_DEVICE); xmlNode *args = create_xml_node(data, XML_TAG_ATTRS); crm_xml_add(data, XML_ATTR_ID, id); crm_xml_add(data, "origin", __FUNCTION__); crm_xml_add(data, "agent", agent); crm_xml_add(data, "namespace", namespace); for (; params; params = params->next) { hash2field((gpointer) params->key, (gpointer) params->value, args); } return data; } static int stonith_api_register_device(stonith_t * st, int call_options, const char *id, const char *namespace, const char *agent, stonith_key_value_t * params) { int rc = 0; xmlNode *data = NULL; #if HAVE_STONITH_STONITH_H namespace = get_stonith_provider(agent, namespace); if (safe_str_eq(namespace, "heartbeat")) { stonith_key_value_add(params, "plugin", agent); agent = "fence_legacy"; } #endif data = create_device_registration_xml(id, namespace, agent, params); rc = stonith_send_command(st, STONITH_OP_DEVICE_ADD, data, NULL, call_options, 0); free_xml(data); return rc; } static int stonith_api_remove_device(stonith_t * st, int call_options, const char *name) { int rc = 0; xmlNode *data = NULL; data = create_xml_node(NULL, F_STONITH_DEVICE); crm_xml_add(data, "origin", __FUNCTION__); crm_xml_add(data, XML_ATTR_ID, name); rc = stonith_send_command(st, STONITH_OP_DEVICE_DEL, data, NULL, call_options, 0); free_xml(data); return rc; } static int stonith_api_remove_level(stonith_t * st, int options, const char *node, int level) { int rc = 0; xmlNode *data = NULL; data = create_xml_node(NULL, F_STONITH_LEVEL); crm_xml_add(data, "origin", __FUNCTION__); crm_xml_add(data, F_STONITH_TARGET, node); crm_xml_add_int(data, XML_ATTR_ID, level); rc = stonith_send_command(st, STONITH_OP_LEVEL_DEL, data, NULL, options, 0); free_xml(data); return rc; } xmlNode * create_level_registration_xml(const char *node, int level, stonith_key_value_t * device_list) { xmlNode *data = create_xml_node(NULL, F_STONITH_LEVEL); crm_xml_add_int(data, XML_ATTR_ID, level); crm_xml_add(data, F_STONITH_TARGET, node); crm_xml_add(data, "origin", __FUNCTION__); for (; device_list; device_list = device_list->next) { xmlNode *dev = create_xml_node(data, F_STONITH_DEVICE); crm_xml_add(dev, XML_ATTR_ID, device_list->value); } return data; } static int stonith_api_register_level(stonith_t * st, int options, const char *node, int level, stonith_key_value_t * device_list) { int rc = 0; xmlNode *data = create_level_registration_xml(node, level, device_list); rc = stonith_send_command(st, STONITH_OP_LEVEL_ADD, data, NULL, options, 0); free_xml(data); return rc; } static void append_arg(gpointer key, gpointer value, gpointer user_data) { int len = 3; /* =, \n, \0 */ int last = 0; char **args = user_data; CRM_CHECK(key != NULL, return); CRM_CHECK(value != NULL, return); if (strstr(key, "pcmk_")) { return; } else if (strstr(key, CRM_META)) { return; } else if (safe_str_eq(key, "crm_feature_set")) { return; } len += strlen(key); len += strlen(value); if (*args != NULL) { last = strlen(*args); } *args = realloc(*args, last + len); crm_trace("Appending: %s=%s", (char *)key, (char *)value); sprintf((*args) + last, "%s=%s\n", (char *)key, (char *)value); } static void append_const_arg(const char *key, const char *value, char **arg_list) { char *glib_sucks_key = strdup(key); char *glib_sucks_value = strdup(value); append_arg(glib_sucks_key, glib_sucks_value, arg_list); free(glib_sucks_value); free(glib_sucks_key); } static void 
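/* Legacy pcmk_arg_map handling: parse "name=param" (or "name:param") pairs separated by commas or whitespace; the special param "uname" resolves to the victim's node name, and anything else is looked up as a node attribute in params */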
append_host_specific_args(const char *victim, const char *map, GHashTable * params, char **arg_list) { char *name = NULL; int last = 0, lpc = 0, max = 0; if (map == NULL) { /* The best default there is for now... */ crm_debug("Using default arg map: port=uname"); append_const_arg("port", victim, arg_list); return; } max = strlen(map); crm_debug("Processing arg map: %s", map); for (; lpc < max + 1; lpc++) { if (isalpha(map[lpc])) { /* keep going */ } else if (map[lpc] == '=' || map[lpc] == ':') { free(name); name = calloc(1, 1 + lpc - last); memcpy(name, map + last, lpc - last); crm_debug("Got name: %s", name); last = lpc + 1; } else if (map[lpc] == 0 || map[lpc] == ',' || isspace(map[lpc])) { char *param = NULL; const char *value = NULL; param = calloc(1, 1 + lpc - last); memcpy(param, map + last, lpc - last); last = lpc + 1; crm_debug("Got key: %s", param); if (name == NULL) { crm_err("Misparsed '%s', found '%s' without a name", map, param); free(param); continue; } if (safe_str_eq(param, "uname")) { value = victim; } else { char *key = crm_meta_name(param); value = g_hash_table_lookup(params, key); free(key); } if (value) { crm_debug("Setting '%s'='%s' (%s) for %s", name, value, param, victim); append_const_arg(name, value, arg_list); } else { crm_err("No node attribute '%s' for '%s'", name, victim); } free(name); name = NULL; free(param); if (map[lpc] == 0) { break; } } else if (isspace(map[lpc])) { last = lpc; } } free(name); } static char * make_args(const char *action, const char *victim, GHashTable * device_args, GHashTable * port_map) { char buffer[512]; char *arg_list = NULL; const char *value = NULL; CRM_CHECK(action != NULL, return NULL); if (device_args) { g_hash_table_foreach(device_args, append_arg, &arg_list); } buffer[511] = 0; snprintf(buffer, 511, "pcmk_%s_action", action); if (device_args) { value = g_hash_table_lookup(device_args, buffer); } if (value == NULL && device_args) { /* Legacy support for early 1.1 releases - Remove for 1.4 */ snprintf(buffer, 511, "pcmk_%s_cmd", action); value = g_hash_table_lookup(device_args, buffer); } if (value == NULL && device_args && safe_str_eq(action, "off")) { /* Legacy support for late 1.1 releases - Remove for 1.4 */ value = g_hash_table_lookup(device_args, "pcmk_poweroff_action"); } if (value) { crm_info("Substituting action '%s' for requested operation '%s'", value, action); action = value; } append_const_arg(STONITH_ATTR_ACTION_OP, action, &arg_list); if (victim && device_args) { const char *alias = victim; const char *param = g_hash_table_lookup(device_args, STONITH_ATTR_HOSTARG); if (port_map && g_hash_table_lookup(port_map, victim)) { alias = g_hash_table_lookup(port_map, victim); } /* Always supply the node's name too: * https://fedorahosted.org/cluster/wiki/FenceAgentAPI */ append_const_arg("nodename", victim, &arg_list); /* Check if we need to supply the victim in any other form */ if (param == NULL) { const char *map = g_hash_table_lookup(device_args, STONITH_ATTR_ARGMAP); if (map == NULL) { param = "port"; value = g_hash_table_lookup(device_args, param); } else { /* Legacy handling */ append_host_specific_args(alias, map, device_args, &arg_list); value = map; /* Nothing more to do */ } } else if (safe_str_eq(param, "none")) { value = param; /* Nothing more to do */ } else { value = g_hash_table_lookup(device_args, param); } /* Don't overwrite explicitly set values for $param */ if (value == NULL || safe_str_eq(value, "dynamic")) { crm_debug("Performing %s action for node '%s' as '%s=%s'", action, victim, param, alias); 
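/* Illustration (hypothetical values): fencing node1 with port_map {node1 -> "3"}, no explicit "port" in device_args and action "reboot" produces the stdin block "action=reboot\nnodename=node1\nport=3\n" */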
append_const_arg(param, alias, &arg_list); } } crm_trace("Calculated: %s", arg_list); return arg_list; } static gboolean st_child_term(gpointer data) { int rc = 0; stonith_action_t *track = data; crm_info("Child %d timed out, sending SIGTERM", track->pid); track->timer_sigterm = 0; track->last_timeout_signo = SIGTERM; rc = kill(track->pid, SIGTERM); if(rc < 0) { crm_perror(LOG_ERR, "Couldn't send SIGTERM to %d", track->pid); } return FALSE; } static gboolean st_child_kill(gpointer data) { int rc = 0; stonith_action_t *track = data; crm_info("Child %d timed out, sending SIGKILL", track->pid); track->timer_sigkill = 0; track->last_timeout_signo = SIGKILL; rc = kill(track->pid, SIGKILL); if(rc < 0) { crm_perror(LOG_ERR, "Couldn't send SIGKILL to %d", track->pid); } return FALSE; } static void -stonith_action_destroy(stonith_action_t *action) +stonith_action_clear_tracking_data(stonith_action_t *action) { - if (action->timer_sigterm > 0) { g_source_remove(action->timer_sigterm); + action->timer_sigterm = 0; } if (action->timer_sigkill > 0) { g_source_remove(action->timer_sigkill); + action->timer_sigkill = 0; } - if (action->fd_stdout) { close(action->fd_stdout); + action->fd_stdout = 0; } + free(action->output); + action->output = NULL; + action->rc = 0; + action->pid = 0; + action->last_timeout_signo = 0; +} + +static void +stonith_action_destroy(stonith_action_t *action) +{ + stonith_action_clear_tracking_data(action); free(action->agent); free(action->args); - free(action->output); free(action->action); free(action->victim); free(action); } stonith_action_t * stonith_action_create(const char *agent, const char *_action, const char *victim, int timeout, GHashTable * device_args, GHashTable * port_map) { stonith_action_t *action; action = calloc(1, sizeof(stonith_action_t)); crm_info("Initiating action %s for agent %s (target=%s)", _action, agent, victim); action->args = make_args(_action, victim, device_args, port_map); action->agent = strdup(agent); action->action = strdup(_action); if (victim) { action->victim = strdup(victim); } - action->timeout = timeout; + action->timeout = action->remaining_timeout = timeout; return action; } #define READ_MAX 500 static char* read_output(int fd) { char buffer[READ_MAX]; char *output = NULL; int len = 0; int more = 0; if (!fd) { return NULL; } do { errno = 0; memset(&buffer, 0, READ_MAX); more = read(fd, buffer, READ_MAX-1); if (more > 0) { crm_trace("Got %d more bytes: %s", more, buffer); output = realloc(output, len + more + 1); sprintf(output+len, "%s", buffer); len += more; } } while (more == (READ_MAX-1) || (more < 0 && errno == EINTR)); return output; } +#define FAILURE_MAX_RETRIES 10 +static gboolean +update_remaining_timeout(stonith_action_t *action) +{ + int diff = time(NULL) - action->initial_start_time; + + if (action->tries >= FAILURE_MAX_RETRIES) { + crm_info("Attempted to execute agent %s (%s) the maximum number of times (%d) allowed", + action->agent, action->action, FAILURE_MAX_RETRIES); + action->remaining_timeout = 0; + } else if ((action->rc != -ETIME) && diff < (action->timeout * 0.7)) { + /* only set remaining timeout period if there is 30% + * or greater of the original timeout period left */ + action->remaining_timeout = action->timeout - diff; + } else { + action->remaining_timeout = 0; + } + return action->remaining_timeout ? 
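/* e.g. with a 120s timeout, a retry is only scheduled while fewer than 84s (70%) have elapsed, guaranteeing the next attempt at least ~36s */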
TRUE : FALSE; +} + static void stonith_action_async_done(GPid pid, gint status, gpointer user_data) { int rc = -pcmk_err_generic; stonith_action_t *action = user_data; if (action->timer_sigterm > 0) { g_source_remove(action->timer_sigterm); } if (action->timer_sigkill > 0) { g_source_remove(action->timer_sigkill); } if(action->last_timeout_signo) { rc = -ETIME; crm_notice("Child process %d performing action '%s' timed out with signal %d", pid, action->action, action->last_timeout_signo); } else if(WIFSIGNALED(status)) { int signo = WTERMSIG(status); rc = -ECONNABORTED; crm_notice("Child process %d performing action '%s' was terminated with signal %d", pid, action->action, signo); } else if(WIFEXITED(status)) { rc = WEXITSTATUS(status); crm_debug("Child process %d performing action '%s' exited with rc %d", pid, action->action, rc); } action->rc = rc; action->output = read_output(action->fd_stdout); + if (action->rc != pcmk_ok && update_remaining_timeout(action)) { + rc = internal_stonith_action_execute(action); + if (rc == pcmk_ok) { + return; + } + } + if (action->done_cb) { action->done_cb(pid, action->rc, action->output, action->userdata); } stonith_action_destroy(action); } static int internal_stonith_action_execute(stonith_action_t *action) { int pid, status, len, rc = -EPROTO; int ret; int total = 0; int p_read_fd, p_write_fd; /* parent read/write file descriptors */ int c_read_fd, c_write_fd; /* child read/write file descriptors */ int fd1[2]; int fd2[2]; + int is_retry = 0; + + /* clear any previous tracking data */ + stonith_action_clear_tracking_data(action); + + if (!action->tries) { + action->initial_start_time = time(NULL); + } + action->tries++; + + if (action->tries > 1) { + crm_info("Attempt %d to execute %s (%s). Remaining timeout is %d", + action->tries, action->agent, action->action, action->remaining_timeout); + is_retry = 1; + } c_read_fd = c_write_fd = p_read_fd = p_write_fd = -1; if (action->args == NULL || action->agent == NULL) goto fail; len = strlen(action->args); if (pipe(fd1)) goto fail; p_read_fd = fd1[0]; c_write_fd = fd1[1]; if (pipe(fd2)) goto fail; c_read_fd = fd2[0]; p_write_fd = fd2[1]; crm_debug("forking"); pid = fork(); if (pid < 0) { rc = -ECHILD; goto fail; } if (!pid) { /* child */ close(1); /* coverity[leaked_handle] False positive */ if (dup(c_write_fd) < 0) goto fail; close(2); /* coverity[leaked_handle] False positive */ if (dup(c_write_fd) < 0) goto fail; close(0); /* coverity[leaked_handle] False positive */ if (dup(c_read_fd) < 0) goto fail; /* keep c_write_fd open so parent can report all errors. 
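 * (both stdout and stderr were dup()'d onto it above, and stdin was redirected to c_read_fd)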
*/ close(c_read_fd); close(p_read_fd); close(p_write_fd); + /* keep retries from executing out of control */ + if (is_retry) { + sleep(1); + } execlp(action->agent, action->agent, NULL); exit(EXIT_FAILURE); } /* parent */ action->pid = pid; ret = fcntl(p_read_fd, F_SETFL, fcntl(p_read_fd, F_GETFL, 0) | O_NONBLOCK); if(ret < 0) { crm_perror(LOG_NOTICE, "Could not change the output of %s to be non-blocking", action->agent); } do { crm_debug("sending args"); ret = write(p_write_fd, action->args + total, len - total); if (ret > 0) { total += ret; } } while (errno == EINTR && total < len); if (total != len) { crm_perror(LOG_ERR, "Sent %d not %d bytes", total, len); if (ret >= 0) { rc = -EREMOTEIO; } goto fail; } close(p_write_fd); /* async */ if (action->async) { action->fd_stdout = p_read_fd; g_child_watch_add(pid, stonith_action_async_done, action); - crm_trace("Op: %s on %s, pid: %d, timeout: %ds", action->action, action->agent, pid, action->timeout); + crm_trace("Op: %s on %s, pid: %d, timeout: %ds", action->action, action->agent, pid, action->remaining_timeout); action->last_timeout_signo = 0; - if (action->timeout) { - action->timer_sigterm = g_timeout_add(1000*action->timeout, st_child_term, action); - action->timer_sigkill = g_timeout_add(1000*(action->timeout+5), st_child_kill, action); + if (action->remaining_timeout) { + action->timer_sigterm = g_timeout_add(1000*action->remaining_timeout, st_child_term, action); + action->timer_sigkill = g_timeout_add(1000*(action->remaining_timeout+5), st_child_kill, action); } else { crm_err("No timeout set for stonith operation %s with device %s", action->action, action->agent); } close(c_write_fd); close(c_read_fd); return 0; } else { /* sync */ - int timeout = action->timeout + 1; + int timeout = action->remaining_timeout + 1; pid_t p = 0; - while (action->timeout < 0 || timeout > 0) { + while (action->remaining_timeout < 0 || timeout > 0) { p = waitpid(pid, &status, WNOHANG); if (p > 0) { break; } sleep(1); timeout--; } if (timeout == 0) { int killrc = kill(pid, 9 /*SIGKILL*/); if (killrc && errno != ESRCH) { crm_err("kill(%d, KILL) failed: %s (%d)", pid, pcmk_strerror(errno), errno); } p = waitpid(pid, &status, WNOHANG); } if (p <= 0) { crm_perror(LOG_ERR, "waitpid(%d)", pid); } else if (p != pid) { crm_err("Waited for %d, got %d", pid, p); } action->output = read_output(p_read_fd); action->rc = -ECONNABORTED; rc = action->rc; if (timeout == 0) { action->rc = -ETIME; } else if (WIFEXITED(status)) { crm_debug("result = %d", WEXITSTATUS(status)); action->rc = -WEXITSTATUS(status); rc = 0; } else if (WIFSIGNALED(status)) { crm_err("call %s for %s exited due to signal %d", action->action, action->agent, WTERMSIG(status)); } else { crm_err("call %s for %s exited abnormally. stopped=%d, continued=%d", action->action, action->agent, WIFSTOPPED(status), WIFCONTINUED(status)); } } fail: if (p_read_fd >= 0) { close(p_read_fd); } if (p_write_fd >= 0) { close(p_write_fd); } if (c_read_fd >= 0) { close(c_read_fd); } if (c_write_fd >= 0) { close(c_write_fd); } return rc; } GPid stonith_action_execute_async(stonith_action_t *action, void *userdata, void (*done)(GPid pid, int rc, const char *output, gpointer user_data)) { int rc = 0; if (!action) { return -1; } action->userdata = userdata; action->done_cb = done; action->async = 1; rc = internal_stonith_action_execute(action); return rc ? 
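/* a negative rc means setup failed before the agent was spawned; otherwise the child's pid is returned so callers can track the operation */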
rc : action->pid; } int stonith_action_execute(stonith_action_t *action, int *agent_result, char **output) { int rc = 0; if (!action) { return -1; } - rc = internal_stonith_action_execute(action); + do { + rc = internal_stonith_action_execute(action); + if (rc == pcmk_ok) { + /* success! */ + break; + } + /* keep retrying while we have time left */ + } while (update_remaining_timeout(action)); + if (rc) { /* error */ return rc; } if (agent_result) { *agent_result = action->rc; } if (output) { *output = action->output; action->output = NULL; /* handed it off, do not free */ } stonith_action_destroy(action); return rc; } static int stonith_api_device_list(stonith_t * stonith, int call_options, const char *namespace, stonith_key_value_t ** devices, int timeout) { int count = 0; if (devices == NULL) { crm_err("Parameter error: stonith_api_device_list"); return -EFAULT; } /* Include Heartbeat agents */ if (namespace == NULL || safe_str_eq("heartbeat", namespace)) { #if HAVE_STONITH_STONITH_H static gboolean need_init = TRUE; char **entry = NULL; char **type_list = NULL; static char **(*type_list_fn) (void) = NULL; static void (*type_free_fn) (char **) = NULL; if(need_init) { need_init = FALSE; type_list_fn = find_library_function(&lha_agents_lib, LHA_STONITH_LIBRARY, "stonith_types", FALSE); type_free_fn = find_library_function(&lha_agents_lib, LHA_STONITH_LIBRARY, "stonith_free_hostlist", FALSE); } if(type_list_fn) { type_list = (*type_list_fn)(); } for (entry = type_list; entry != NULL && *entry; ++entry) { crm_trace("Added: %s", *entry); *devices = stonith_key_value_add(*devices, NULL, *entry); count++; } if (type_list && type_free_fn) { (*type_free_fn)(type_list); } #else if(namespace != NULL) { return -EINVAL; /* Heartbeat agents not supported */ } #endif } /* Include Red Hat agents, basically: ls -1 @sbin_dir@/fence_* */ if (namespace == NULL || safe_str_eq("redhat", namespace)) { struct dirent **namelist; int file_num = scandir(RH_STONITH_DIR, &namelist, 0, alphasort); if (file_num > 0) { struct stat prop; char buffer[FILENAME_MAX + 1]; while (file_num--) { if ('.' == namelist[file_num]->d_name[0]) { free(namelist[file_num]); continue; } else if (0 != strncmp(RH_STONITH_PREFIX, namelist[file_num]->d_name, strlen(RH_STONITH_PREFIX))) { free(namelist[file_num]); continue; } snprintf(buffer, FILENAME_MAX, "%s/%s", RH_STONITH_DIR, namelist[file_num]->d_name); if (stat(buffer, &prop) == 0 && S_ISREG(prop.st_mode)) { *devices = stonith_key_value_add(*devices, NULL, namelist[file_num]->d_name); count++; } free(namelist[file_num]); } free(namelist); } } return count; } static int stonith_api_device_metadata(stonith_t * stonith, int call_options, const char *agent, const char *namespace, char **output, int timeout) { int rc = 0; char *buffer = NULL; const char *provider = get_stonith_provider(agent, namespace); crm_trace("looking up %s/%s metadata", agent, provider); /* By having this in a library, we can access it from stonith_admin * when neither lrmd or stonith-ng are running * Important for the crm shell's validations... 
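 *
 * For "redhat"-class agents this is done by exec'ing the agent itself with
 * the "metadata" action, then patching up the returned XML below.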
*/ if (safe_str_eq(provider, "redhat")) { stonith_action_t *action = stonith_action_create(agent, "metadata", NULL, 5, NULL, NULL); int exec_rc = stonith_action_execute(action, &rc, &buffer); if (exec_rc < 0 || rc != 0 || buffer == NULL) { crm_debug("Query failed: %d %d: %s", exec_rc, rc, crm_str(buffer)); free(buffer); /* Just in case */ return -EINVAL; } else { xmlNode *xml = string2xml(buffer); xmlNode *actions = NULL; xmlXPathObject *xpathObj = NULL; xpathObj = xpath_search(xml, "//actions"); if (xpathObj && xpathObj->nodesetval->nodeNr > 0) { actions = getXpathResult(xpathObj, 0); } /* Now fudge the metadata so that the start/stop actions appear */ xpathObj = xpath_search(xml, "//action[@name='stop']"); if (xpathObj == NULL || xpathObj->nodesetval->nodeNr <= 0) { xmlNode *tmp = NULL; tmp = create_xml_node(actions, "action"); crm_xml_add(tmp, "name", "stop"); crm_xml_add(tmp, "timeout", "20s"); tmp = create_xml_node(actions, "action"); crm_xml_add(tmp, "name", "start"); crm_xml_add(tmp, "timeout", "20s"); } /* Now fudge the metadata so that the port isn't required in the configuration */ xpathObj = xpath_search(xml, "//parameter[@name='port']"); if (xpathObj && xpathObj->nodesetval->nodeNr > 0) { /* We'll fill this in */ xmlNode *tmp = getXpathResult(xpathObj, 0); crm_xml_add(tmp, "required", "0"); } free(buffer); buffer = dump_xml_formatted(xml); free_xml(xml); if (!buffer) { return -EINVAL; } } } else { #if !HAVE_STONITH_STONITH_H return -EINVAL; /* Heartbeat agents not supported */ #else int bufferlen = 0; static const char *no_parameter_info = ""; Stonith *stonith_obj = NULL; static gboolean need_init = TRUE; static Stonith *(*st_new_fn) (const char *) = NULL; static const char *(*st_info_fn) (Stonith *, int) = NULL; static void (*st_del_fn) (Stonith *) = NULL; if(need_init) { need_init = FALSE; st_new_fn = find_library_function(&lha_agents_lib, LHA_STONITH_LIBRARY, "stonith_new", FALSE); st_del_fn = find_library_function(&lha_agents_lib, LHA_STONITH_LIBRARY, "stonith_delete", FALSE); st_info_fn = find_library_function(&lha_agents_lib, LHA_STONITH_LIBRARY, "stonith_get_info", FALSE); } if (lha_agents_lib && st_new_fn && st_del_fn && st_info_fn) { char *xml_meta_longdesc = NULL; char *xml_meta_shortdesc = NULL; char *meta_param = NULL; char *meta_longdesc = NULL; char *meta_shortdesc = NULL; stonith_obj = (*st_new_fn) (agent); if(stonith_obj) { meta_longdesc = strdup((*st_info_fn)(stonith_obj, ST_DEVICEDESCR)); if (meta_longdesc == NULL) { crm_warn("no long description in %s's metadata.", agent); meta_longdesc = strdup(no_parameter_info); } meta_shortdesc = strdup((*st_info_fn)(stonith_obj, ST_DEVICEID)); if (meta_shortdesc == NULL) { crm_warn("no short description in %s's metadata.", agent); meta_shortdesc = strdup(no_parameter_info); } meta_param = strdup((*st_info_fn)(stonith_obj, ST_CONF_XML)); if (meta_param == NULL) { crm_warn("no list of parameters in %s's metadata.", agent); meta_param = strdup(no_parameter_info); } (*st_del_fn)(stonith_obj); } else { return -EINVAL; /* Heartbeat agents not supported */ } xml_meta_longdesc = (char *)xmlEncodeEntitiesReentrant(NULL, (const unsigned char *)meta_longdesc); xml_meta_shortdesc = (char *)xmlEncodeEntitiesReentrant(NULL, (const unsigned char *)meta_shortdesc); bufferlen = strlen(META_TEMPLATE) + strlen(agent) + strlen(xml_meta_longdesc) + strlen(xml_meta_shortdesc) + strlen(meta_param) + 1; buffer = calloc(1, bufferlen); snprintf(buffer, bufferlen - 1, META_TEMPLATE, agent, xml_meta_longdesc, xml_meta_shortdesc, meta_param); 
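/* strings escaped via xmlEncodeEntitiesReentrant() must be released with xmlFree(); the strdup()'d copies use plain free() */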
xmlFree(xml_meta_longdesc); xmlFree(xml_meta_shortdesc); free(meta_shortdesc); free(meta_longdesc); free(meta_param); } #endif } if (output) { *output = buffer; } else { free(buffer); } return rc; } static int stonith_api_query(stonith_t * stonith, int call_options, const char *target, stonith_key_value_t ** devices, int timeout) { int rc = 0, lpc = 0, max = 0; xmlNode *data = NULL; xmlNode *output = NULL; xmlXPathObjectPtr xpathObj = NULL; CRM_CHECK(devices != NULL, return -EINVAL); data = create_xml_node(NULL, F_STONITH_DEVICE); crm_xml_add(data, "origin", __FUNCTION__); crm_xml_add(data, F_STONITH_TARGET, target); crm_xml_add(data, F_STONITH_ACTION, "off"); rc = stonith_send_command(stonith, STONITH_OP_QUERY, data, &output, call_options, timeout); if (rc < 0) { return rc; } xpathObj = xpath_search(output, "//@agent"); if (xpathObj) { max = xpathObj->nodesetval->nodeNr; for (lpc = 0; lpc < max; lpc++) { xmlNode *match = getXpathResult(xpathObj, lpc); CRM_CHECK(match != NULL, continue); crm_info("%s[%d] = %s", "//@agent", lpc, xmlGetNodePath(match)); *devices = stonith_key_value_add(*devices, NULL, crm_element_value(match, XML_ATTR_ID)); } } free_xml(output); free_xml(data); return max; } static int stonith_api_call(stonith_t * stonith, int call_options, const char *id, const char *action, const char *victim, int timeout, xmlNode **output) { int rc = 0; xmlNode *data = NULL; data = create_xml_node(NULL, F_STONITH_DEVICE); crm_xml_add(data, "origin", __FUNCTION__); crm_xml_add(data, F_STONITH_DEVICE, id); crm_xml_add(data, F_STONITH_ACTION, action); crm_xml_add(data, F_STONITH_TARGET, victim); rc = stonith_send_command(stonith, STONITH_OP_EXEC, data, output, call_options, timeout); free_xml(data); return rc; } static int stonith_api_list(stonith_t * stonith, int call_options, const char *id, char **list_info, int timeout) { int rc; xmlNode *output = NULL; rc = stonith_api_call(stonith, call_options, id, "list", NULL, timeout, &output); if (output && list_info) { const char *list_str; list_str = crm_element_value(output, "st_output"); if (list_str) { *list_info = strdup(list_str); } } if (output) { free_xml(output); } return rc; } static int stonith_api_monitor(stonith_t * stonith, int call_options, const char *id, int timeout) { return stonith_api_call(stonith, call_options, id, "monitor", NULL, timeout, NULL); } static int stonith_api_status(stonith_t * stonith, int call_options, const char *id, const char *port, int timeout) { return stonith_api_call(stonith, call_options, id, "status", port, timeout, NULL); } static int stonith_api_fence(stonith_t * stonith, int call_options, const char *node, const char *action, int timeout, int tolerance) { int rc = 0; xmlNode *data = NULL; data = create_xml_node(NULL, __FUNCTION__); crm_xml_add(data, F_STONITH_TARGET, node); crm_xml_add(data, F_STONITH_ACTION, action); crm_xml_add_int(data, F_STONITH_TIMEOUT, timeout); crm_xml_add_int(data, F_STONITH_TOLERANCE, tolerance); rc = stonith_send_command(stonith, STONITH_OP_FENCE, data, NULL, call_options, timeout); free_xml(data); return rc; } static int stonith_api_confirm(stonith_t * stonith, int call_options, const char *target) { return stonith_api_fence(stonith, call_options | st_opt_manual_ack, target, "off", 0, 0); } static int stonith_api_history(stonith_t * stonith, int call_options, const char *node, stonith_history_t ** history, int timeout) { int rc = 0; xmlNode *data = NULL; xmlNode *output = NULL; stonith_history_t *last = NULL; *history = NULL; if (node) { data = create_xml_node(NULL, 
__FUNCTION__); crm_xml_add(data, F_STONITH_TARGET, node); } rc = stonith_send_command(stonith, STONITH_OP_FENCE_HISTORY, data, &output, call_options | st_opt_sync_call, timeout); free_xml(data); if (rc == 0) { xmlNode *op = NULL; xmlNode *reply = get_xpath_object("//" F_STONITH_HISTORY_LIST, output, LOG_ERR); for (op = __xml_first_child(reply); op != NULL; op = __xml_next(op)) { stonith_history_t *kvp; kvp = calloc(1, sizeof(stonith_history_t)); kvp->target = crm_element_value_copy(op, F_STONITH_TARGET); kvp->action = crm_element_value_copy(op, F_STONITH_ACTION); kvp->origin = crm_element_value_copy(op, F_STONITH_ORIGIN); kvp->delegate = crm_element_value_copy(op, F_STONITH_DELEGATE); crm_element_value_int(op, F_STONITH_DATE, &kvp->completed); crm_element_value_int(op, F_STONITH_STATE, &kvp->state); if (last) { last->next = kvp; } else { *history = kvp; } last = kvp; } } return rc; } gboolean is_redhat_agent(const char *agent) { int rc = 0; struct stat prop; char buffer[FILENAME_MAX + 1]; snprintf(buffer, FILENAME_MAX, "%s/%s", RH_STONITH_DIR, agent); rc = stat(buffer, &prop); if (rc >= 0 && S_ISREG(prop.st_mode)) { return TRUE; } return FALSE; } const char * get_stonith_provider(const char *agent, const char *provider) { /* This function sucks */ if (is_redhat_agent(agent)) { return "redhat"; #if HAVE_STONITH_STONITH_H } else { Stonith *stonith_obj = NULL; static gboolean need_init = TRUE; static Stonith *(*st_new_fn) (const char *) = NULL; static void (*st_del_fn) (Stonith *) = NULL; if(need_init) { need_init = FALSE; st_new_fn = find_library_function(&lha_agents_lib, LHA_STONITH_LIBRARY, "stonith_new", FALSE); st_del_fn = find_library_function(&lha_agents_lib, LHA_STONITH_LIBRARY, "stonith_delete", FALSE); } if (lha_agents_lib && st_new_fn && st_del_fn) { stonith_obj = (*st_new_fn) (agent); if(stonith_obj) { (*st_del_fn)(stonith_obj); return "heartbeat"; } } #endif } crm_err("No such device: %s", agent); return NULL; } static gint stonithlib_GCompareFunc(gconstpointer a, gconstpointer b) { int rc = 0; const stonith_notify_client_t *a_client = a; const stonith_notify_client_t *b_client = b; CRM_CHECK(a_client->event != NULL && b_client->event != NULL, return 0); rc = strcmp(a_client->event, b_client->event); if (rc == 0) { if (a_client->notify == NULL || b_client->notify == NULL) { return 0; } else if (a_client->notify == b_client->notify) { return 0; } else if (((long)a_client->notify) < ((long)b_client->notify)) { crm_err("callbacks for %s are not equal: %p vs. %p", a_client->event, a_client->notify, b_client->notify); return -1; } crm_err("callbacks for %s are not equal: %p vs. 
%p", a_client->event, a_client->notify, b_client->notify); return 1; } return rc; } xmlNode * stonith_create_op(int call_id, const char *token, const char *op, xmlNode * data, int call_options) { xmlNode *op_msg = create_xml_node(NULL, "stonith_command"); CRM_CHECK(op_msg != NULL, return NULL); CRM_CHECK(token != NULL, return NULL); crm_xml_add(op_msg, F_XML_TAGNAME, "stonith_command"); crm_xml_add(op_msg, F_TYPE, T_STONITH_NG); crm_xml_add(op_msg, F_STONITH_CALLBACK_TOKEN, token); crm_xml_add(op_msg, F_STONITH_OPERATION, op); crm_xml_add_int(op_msg, F_STONITH_CALLID, call_id); crm_trace("Sending call options: %.8lx, %d", (long)call_options, call_options); crm_xml_add_int(op_msg, F_STONITH_CALLOPTS, call_options); if (data != NULL) { add_message_xml(op_msg, F_STONITH_CALLDATA, data); } return op_msg; } static void stonith_destroy_op_callback(gpointer data) { stonith_callback_client_t *blob = data; if (blob->timer && blob->timer->ref > 0) { g_source_remove(blob->timer->ref); } free(blob->timer); free(blob); } static int stonith_api_signoff(stonith_t * stonith) { stonith_private_t *native = stonith->private; crm_debug("Signing out of the STONITH Service"); if (native->source != NULL) { /* Attached to mainloop */ mainloop_del_ipc_client(native->source); native->source = NULL; native->ipc = NULL; } else if(native->ipc) { /* Not attached to mainloop */ crm_ipc_t *ipc = native->ipc; native->ipc = NULL; crm_ipc_close(ipc); crm_ipc_destroy(ipc); } stonith->state = stonith_disconnected; return pcmk_ok; } static int stonith_api_signon(stonith_t * stonith, const char *name, int *stonith_fd) { int rc = pcmk_ok; stonith_private_t *native = stonith->private; static struct ipc_client_callbacks st_callbacks = { .dispatch = stonith_dispatch_internal, .destroy = stonith_connection_destroy }; crm_trace("Connecting command channel"); stonith->state = stonith_connected_command; if(stonith_fd) { /* No mainloop */ native->ipc = crm_ipc_new("stonith-ng", 0); if(native->ipc && crm_ipc_connect(native->ipc)) { *stonith_fd = crm_ipc_get_fd(native->ipc); } else if(native->ipc) { rc = -ENOTCONN; } } else { /* With mainloop */ native->source = mainloop_add_ipc_client("stonith-ng", G_PRIORITY_MEDIUM, 0, stonith, &st_callbacks); native->ipc = mainloop_get_ipc_client(native->source); } if (native->ipc == NULL) { crm_debug("Could not connect to the Stonith API"); rc = -ENOTCONN; } if (rc == pcmk_ok) { xmlNode *reply = NULL; xmlNode *hello = create_xml_node(NULL, "stonith_command"); crm_xml_add(hello, F_TYPE, T_STONITH_NG); crm_xml_add(hello, F_STONITH_OPERATION, CRM_OP_REGISTER); crm_xml_add(hello, F_STONITH_CLIENTNAME, name); rc = crm_ipc_send(native->ipc, hello, crm_ipc_client_response, -1, &reply); if(rc < 0) { crm_perror(LOG_DEBUG, "Couldn't complete registration with the fencing API: %d", rc); rc = -ECOMM; } else if(reply == NULL) { crm_err("Did not receive registration reply"); rc = -EPROTO; } else { const char *msg_type = crm_element_value(reply, F_STONITH_OPERATION); const char *tmp_ticket = crm_element_value(reply, F_STONITH_CLIENTID); if (safe_str_neq(msg_type, CRM_OP_REGISTER)) { crm_err("Invalid registration message: %s", msg_type); crm_log_xml_err(reply, "Bad reply"); rc = -EPROTO; } else if (tmp_ticket == NULL) { crm_err("No registration token provided"); crm_log_xml_err(reply, "Bad reply"); rc = -EPROTO; } else { crm_trace("Obtained registration token: %s", tmp_ticket); native->token = strdup(tmp_ticket); rc = pcmk_ok; } } free_xml(reply); free_xml(hello); } if (rc == pcmk_ok) { #if HAVE_MSGFROMIPC_TIMEOUT 
stonith->call_timeout = MAX_IPC_DELAY; #endif crm_debug("Connection to STONITH successful"); return pcmk_ok; } crm_debug("Connection to STONITH failed: %s", pcmk_strerror(rc)); stonith->cmds->disconnect(stonith); return rc; } static int stonith_set_notification(stonith_t * stonith, const char *callback, int enabled) { xmlNode *notify_msg = create_xml_node(NULL, __FUNCTION__); stonith_private_t *native = stonith->private; if (stonith->state != stonith_disconnected) { int rc; crm_xml_add(notify_msg, F_STONITH_OPERATION, T_STONITH_NOTIFY); if (enabled) { crm_xml_add(notify_msg, F_STONITH_NOTIFY_ACTIVATE, callback); } else { crm_xml_add(notify_msg, F_STONITH_NOTIFY_DEACTIVATE, callback); } rc = crm_ipc_send(native->ipc, notify_msg, crm_ipc_client_response, -1, NULL); if(rc < 0) { crm_perror(LOG_DEBUG, "Couldn't register for fencing notifications: %d", rc); rc = -ECOMM; } } free_xml(notify_msg); return pcmk_ok; } static int stonith_api_add_notification(stonith_t * stonith, const char *event, void (*callback) (stonith_t * stonith, stonith_event_t *e)) { GList *list_item = NULL; stonith_notify_client_t *new_client = NULL; stonith_private_t *private = NULL; private = stonith->private; crm_trace("Adding callback for %s events (%d)", event, g_list_length(private->notify_list)); new_client = calloc(1, sizeof(stonith_notify_client_t)); new_client->event = event; new_client->notify = callback; list_item = g_list_find_custom(private->notify_list, new_client, stonithlib_GCompareFunc); if (list_item != NULL) { crm_warn("Callback already present"); free(new_client); return -ENOTUNIQ; } else { private->notify_list = g_list_append(private->notify_list, new_client); stonith_set_notification(stonith, event, 1); crm_trace("Callback added (%d)", g_list_length(private->notify_list)); } return pcmk_ok; } static int stonith_api_del_notification(stonith_t * stonith, const char *event) { GList *list_item = NULL; stonith_notify_client_t *new_client = NULL; stonith_private_t *private = NULL; crm_debug("Removing callback for %s events", event); private = stonith->private; new_client = calloc(1, sizeof(stonith_notify_client_t)); new_client->event = event; new_client->notify = NULL; list_item = g_list_find_custom(private->notify_list, new_client, stonithlib_GCompareFunc); stonith_set_notification(stonith, event, 0); if (list_item != NULL) { stonith_notify_client_t *list_client = list_item->data; private->notify_list = g_list_remove(private->notify_list, list_client); free(list_client); crm_trace("Removed callback"); } else { crm_trace("Callback not present"); } free(new_client); return pcmk_ok; } static gboolean stonith_async_timeout_handler(gpointer data) { struct timer_rec_s *timer = data; crm_err("Async call %d timed out after %dms", timer->call_id, timer->timeout); stonith_perform_callback(timer->stonith, NULL, timer->call_id, -ETIME); /* Always return TRUE, never remove the handler * We do that in stonith_del_callback() */ return TRUE; } static void set_callback_timeout(stonith_callback_client_t *callback, stonith_t *stonith, int call_id, int timeout) { struct timer_rec_s *async_timer = callback->timer; if (timeout <= 0) { return; } if (!async_timer) { async_timer = calloc(1, sizeof(struct timer_rec_s)); callback->timer = async_timer; } async_timer->stonith = stonith; async_timer->call_id = call_id; /* Allow a fair bit of grace to allow the server to tell us of a timeout * This is only a fallback */ async_timer->timeout = (timeout + 60) * 1000; if (async_timer->ref) { g_source_remove(async_timer->ref); } 
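/* (re)arm the client-side fallback timer; the server normally reports timeouts itself, so this should only fire if that report never arrives */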
async_timer->ref = g_timeout_add(async_timer->timeout, stonith_async_timeout_handler, async_timer); } static void update_callback_timeout(int call_id, int timeout, stonith_t *st) { stonith_callback_client_t *callback = NULL; stonith_private_t *private = st->private; callback = g_hash_table_lookup(private->stonith_op_callback_table, GINT_TO_POINTER(call_id)); if (!callback || !callback->allow_timeout_updates) { return; } set_callback_timeout(callback, st, call_id, timeout); } static void invoke_callback(stonith_t *st, int call_id, int rc, void *userdata, void (*callback) (stonith_t * st, stonith_callback_data_t *data)) { stonith_callback_data_t data = { 0, }; data.call_id = call_id; data.rc = rc; data.userdata = userdata; callback(st, &data); } static int stonith_api_add_callback(stonith_t * stonith, int call_id, int timeout, int options, void *user_data, const char *callback_name, void (*callback) (stonith_t * st, stonith_callback_data_t *data)) { stonith_callback_client_t *blob = NULL; stonith_private_t *private = NULL; CRM_CHECK(stonith != NULL, return -EINVAL); CRM_CHECK(stonith->private != NULL, return -EINVAL); private = stonith->private; if (call_id == 0) { private->op_callback = callback; } else if (call_id < 0) { if (!(options & st_opt_report_only_success)) { crm_trace("Call failed, calling %s: %s", callback_name, pcmk_strerror(call_id)); invoke_callback(stonith, call_id, call_id, user_data, callback); } else { crm_warn("STONITH call failed: %s", pcmk_strerror(call_id)); } return FALSE; } blob = calloc(1, sizeof(stonith_callback_client_t)); blob->id = callback_name; blob->only_success = (options & st_opt_report_only_success) ? TRUE : FALSE; blob->user_data = user_data; blob->callback = callback; blob->allow_timeout_updates = (options & st_opt_timeout_updates) ? 
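/* st_opt_timeout_updates lets update_callback_timeout() re-arm this timer if the server later extends the operation's timeout */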
TRUE : FALSE; if (timeout > 0) { set_callback_timeout(blob, stonith, call_id, timeout); } g_hash_table_insert(private->stonith_op_callback_table, GINT_TO_POINTER(call_id), blob); crm_trace("Added callback to %s for call %d", callback_name, call_id); return TRUE; } static int stonith_api_del_callback(stonith_t * stonith, int call_id, bool all_callbacks) { stonith_private_t *private = stonith->private; if (all_callbacks) { private->op_callback = NULL; g_hash_table_destroy(private->stonith_op_callback_table); private->stonith_op_callback_table = g_hash_table_new_full(g_direct_hash, g_direct_equal, NULL, stonith_destroy_op_callback); } else if (call_id == 0) { private->op_callback = NULL; } else { g_hash_table_remove(private->stonith_op_callback_table, GINT_TO_POINTER(call_id)); } return pcmk_ok; } static void stonith_dump_pending_op(gpointer key, gpointer value, gpointer user_data) { int call = GPOINTER_TO_INT(key); stonith_callback_client_t *blob = value; crm_debug("Call %d (%s): pending", call, crm_str(blob->id)); } void stonith_dump_pending_callbacks(stonith_t * stonith) { stonith_private_t *private = stonith->private; if (private->stonith_op_callback_table == NULL) { return; } return g_hash_table_foreach(private->stonith_op_callback_table, stonith_dump_pending_op, NULL); } void stonith_perform_callback(stonith_t * stonith, xmlNode * msg, int call_id, int rc) { stonith_private_t *private = NULL; stonith_callback_client_t *blob = NULL; stonith_callback_client_t local_blob; CRM_CHECK(stonith != NULL, return); CRM_CHECK(stonith->private != NULL, return); private = stonith->private; local_blob.id = NULL; local_blob.callback = NULL; local_blob.user_data = NULL; local_blob.only_success = FALSE; if (msg != NULL) { crm_element_value_int(msg, F_STONITH_RC, &rc); crm_element_value_int(msg, F_STONITH_CALLID, &call_id); } CRM_CHECK(call_id > 0, crm_log_xml_err(msg, "Bad result")); blob = g_hash_table_lookup(private->stonith_op_callback_table, GINT_TO_POINTER(call_id)); if (blob != NULL) { local_blob = *blob; blob = NULL; stonith_api_del_callback(stonith, call_id, FALSE); } else { crm_trace("No callback found for call %d", call_id); local_blob.callback = NULL; } if (local_blob.callback != NULL && (rc == pcmk_ok || local_blob.only_success == FALSE)) { crm_trace("Invoking callback %s for call %d", crm_str(local_blob.id), call_id); invoke_callback(stonith, call_id, rc, local_blob.user_data, local_blob.callback); } else if (private->op_callback == NULL && rc != pcmk_ok) { crm_warn("STONITH command failed: %s", pcmk_strerror(rc)); crm_log_xml_debug(msg, "Failed STONITH Update"); } if (private->op_callback != NULL) { crm_trace("Invoking global callback for call %d", call_id); invoke_callback(stonith, call_id, rc, NULL, private->op_callback); } crm_trace("OP callback activated."); } /* */ static stonith_event_t * xml_to_event(xmlNode *msg) { stonith_event_t *event = calloc(1, sizeof(stonith_event_t)); const char *ntype = crm_element_value(msg, F_SUBTYPE); char *data_addr = g_strdup_printf("//%s", ntype); xmlNode *data = get_xpath_object(data_addr, msg, LOG_DEBUG); crm_log_xml_trace(msg, "stonith_notify"); crm_element_value_int(msg, F_STONITH_RC, &(event->result)); if(safe_str_eq(ntype, T_STONITH_NOTIFY_FENCE)) { event->operation = crm_element_value_copy(msg, F_STONITH_OPERATION); if(data) { event->origin = crm_element_value_copy(data, F_STONITH_ORIGIN); event->action = crm_element_value_copy(data, F_STONITH_ACTION); event->target = crm_element_value_copy(data, F_STONITH_TARGET); event->executioner = 
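Two details of stonith_perform_callback() are worth noting: per-call callbacks are one-shot (the blob is copied and then removed via stonith_api_del_callback() before being invoked), and a callback registered under call_id 0 becomes the global handler that additionally sees every result. A sketch of the latter (my_global_audit is a hypothetical function):

    static void
    my_global_audit(stonith_t * st, stonith_callback_data_t *data)
    {
        crm_debug("Fencer op %d completed: %s", data->call_id, pcmk_strerror(data->rc));
    }

    /* call_id 0 installs the global handler; it runs after any per-call
     * callback and receives NULL userdata */
    st->cmds->register_callback(st, 0, 0, st_opt_none, NULL,
                                "my_global_audit", my_global_audit);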
crm_element_value_copy(data, F_STONITH_DELEGATE); event->id = crm_element_value_copy(data, F_STONITH_REMOTE_OP_ID); event->client_origin = crm_element_value_copy(data, F_STONITH_CLIENTNAME); } else { crm_err("No data for %s event", ntype); crm_log_xml_notice(msg, "BadEvent"); } } g_free(data_addr); return event; } static void event_free(stonith_event_t *event) { free(event->id); free(event->type); free(event->message); free(event->operation); free(event->origin); free(event->action); free(event->target); free(event->executioner); free(event->device); free(event->client_origin); free(event); } static void stonith_send_notification(gpointer data, gpointer user_data) { struct notify_blob_s *blob = user_data; stonith_notify_client_t *entry = data; stonith_event_t *st_event = NULL; const char *event = NULL; if (blob->xml == NULL) { crm_warn("Skipping callback - NULL message"); return; } event = crm_element_value(blob->xml, F_SUBTYPE); if (entry == NULL) { crm_warn("Skipping callback - NULL callback client"); return; } else if (entry->notify == NULL) { crm_warn("Skipping callback - NULL callback"); return; } else if (safe_str_neq(entry->event, event)) { crm_trace("Skipping callback - event mismatch %p/%s vs. %s", entry, entry->event, event); return; } st_event = xml_to_event(blob->xml); crm_trace("Invoking callback for %p/%s event...", entry, event); entry->notify(blob->stonith, st_event); crm_trace("Callback invoked..."); event_free(st_event); } int stonith_send_command(stonith_t * stonith, const char *op, xmlNode * data, xmlNode ** output_data, int call_options, int timeout) { int rc = 0; int reply_id = -1; enum crm_ipc_flags ipc_flags = crm_ipc_client_none; xmlNode *op_msg = NULL; xmlNode *op_reply = NULL; stonith_private_t *native = stonith->private; if (stonith->state == stonith_disconnected) { return -ENOTCONN; } if (output_data != NULL) { *output_data = NULL; } if (op == NULL) { crm_err("No operation specified"); return -EINVAL; } if (call_options & st_opt_sync_call) { ipc_flags |= crm_ipc_client_response; } stonith->call_id++; /* prevent call_id from being negative (or zero) and conflicting * with the stonith_errors enum * use 2 because we use it as (stonith->call_id - 1) below */ if (stonith->call_id < 1) { stonith->call_id = 1; } CRM_CHECK(native->token != NULL,;); op_msg = stonith_create_op(stonith->call_id, native->token, op, data, call_options); if (op_msg == NULL) { return -EINVAL; } crm_xml_add_int(op_msg, F_STONITH_TIMEOUT, timeout); crm_trace("Sending %s message to STONITH service, Timeout: %ds", op, timeout); rc = crm_ipc_send(native->ipc, op_msg, ipc_flags, 1000*(timeout + 60), &op_reply); free_xml(op_msg); if(rc < 0) { crm_perror(LOG_ERR, "Couldn't perform %s operation (timeout=%ds): %d", op, timeout, rc); rc = -ECOMM; goto done; } crm_log_xml_trace(op_reply, "Reply"); if (!(call_options & st_opt_sync_call)) { crm_trace("Async call %d, returning", stonith->call_id); CRM_CHECK(stonith->call_id != 0, return -EPROTO); free_xml(op_reply); return stonith->call_id; } rc = pcmk_ok; crm_element_value_int(op_reply, F_STONITH_CALLID, &reply_id); if (reply_id == stonith->call_id) { crm_trace("Synchronous reply %d received", reply_id); if (crm_element_value_int(op_reply, F_STONITH_RC, &rc) != 0) { rc = -ENOMSG; } if ((call_options & st_opt_discard_reply) || output_data == NULL) { crm_trace("Discarding reply"); } else { *output_data = op_reply; op_reply = NULL; /* Prevent subsequent free */ } } else if (reply_id <= 0) { crm_err("Received bad reply: No id set"); crm_log_xml_err(op_reply,
"Bad reply"); free_xml(op_reply); rc = -ENOMSG; } else { crm_err("Received bad reply: %d (wanted %d)", reply_id, stonith->call_id); crm_log_xml_err(op_reply, "Old reply"); free_xml(op_reply); rc = -ENOMSG; } done: if (crm_ipc_connected(native->ipc) == FALSE) { crm_err("STONITH disconnected"); stonith->state = stonith_disconnected; } free_xml(op_reply); return rc; } /* Not used with mainloop */ bool stonith_dispatch(stonith_t * st) { gboolean stay_connected = TRUE; stonith_private_t *private = NULL; CRM_ASSERT(st != NULL); private = st->private; while(crm_ipc_ready(private->ipc)) { if(crm_ipc_read(private->ipc) > 0) { const char *msg = crm_ipc_buffer(private->ipc); stonith_dispatch_internal(msg, strlen(msg), st); } if(crm_ipc_connected(private->ipc) == FALSE) { crm_err("Connection closed"); stay_connected = FALSE; } } return stay_connected; } int stonith_dispatch_internal(const char *buffer, ssize_t length, gpointer userdata) { const char *type = NULL; struct notify_blob_s blob; stonith_t * st = userdata; stonith_private_t *private = NULL; CRM_ASSERT(st != NULL); private = st->private; blob.stonith = st; blob.xml = string2xml(buffer); if (blob.xml == NULL) { crm_warn("Received a NULL msg from STONITH service: %s.", buffer); return 0; } /* do callbacks */ type = crm_element_value(blob.xml, F_TYPE); crm_trace("Activating %s callbacks...", type); if (safe_str_eq(type, T_STONITH_NG)) { stonith_perform_callback(st, blob.xml, 0, 0); } else if (safe_str_eq(type, T_STONITH_NOTIFY)) { g_list_foreach(private->notify_list, stonith_send_notification, &blob); } else if (safe_str_eq(type, T_STONITH_TIMEOUT_VALUE)) { int call_id = 0; int timeout = 0; crm_element_value_int(blob.xml, F_STONITH_TIMEOUT, &timeout); crm_element_value_int(blob.xml, F_STONITH_CALLID, &call_id); update_callback_timeout(call_id, timeout, st); } else { crm_err("Unknown message type: %s", type); crm_log_xml_warn(blob.xml, "BadReply"); } free_xml(blob.xml); return 1; } static int stonith_api_free(stonith_t * stonith) { int rc = pcmk_ok; if (stonith->state != stonith_disconnected) { rc = stonith->cmds->disconnect(stonith); } if (stonith->state == stonith_disconnected) { stonith_private_t *private = stonith->private; g_hash_table_destroy(private->stonith_op_callback_table); free(private->token); free(stonith->private); free(stonith->cmds); free(stonith); } return rc; } void stonith_api_delete(stonith_t * stonith) { stonith_private_t *private = stonith->private; GList *list = private->notify_list; while (list != NULL) { stonith_notify_client_t *client = g_list_nth_data(list, 0); list = g_list_remove(list, client); free(client); } stonith->cmds->free(stonith); stonith = NULL; } stonith_t * stonith_api_new(void) { stonith_t *new_stonith = NULL; stonith_private_t *private = NULL; new_stonith = calloc(1, sizeof(stonith_t)); private = calloc(1, sizeof(stonith_private_t)); new_stonith->private = private; private->stonith_op_callback_table = g_hash_table_new_full(g_direct_hash, g_direct_equal, NULL, stonith_destroy_op_callback); private->notify_list = NULL; new_stonith->call_id = 1; new_stonith->state = stonith_disconnected; new_stonith->cmds = calloc(1, sizeof(stonith_api_operations_t)); /* *INDENT-OFF* */ new_stonith->cmds->free = stonith_api_free; new_stonith->cmds->connect = stonith_api_signon; new_stonith->cmds->disconnect = stonith_api_signoff; new_stonith->cmds->list = stonith_api_list; new_stonith->cmds->monitor = stonith_api_monitor; new_stonith->cmds->status = stonith_api_status; new_stonith->cmds->fence = stonith_api_fence;
new_stonith->cmds->confirm = stonith_api_confirm; new_stonith->cmds->history = stonith_api_history; new_stonith->cmds->list_agents = stonith_api_device_list; new_stonith->cmds->metadata = stonith_api_device_metadata; new_stonith->cmds->query = stonith_api_query; new_stonith->cmds->remove_device = stonith_api_remove_device; new_stonith->cmds->register_device = stonith_api_register_device; new_stonith->cmds->remove_level = stonith_api_remove_level; new_stonith->cmds->register_level = stonith_api_register_level; new_stonith->cmds->remove_callback = stonith_api_del_callback; new_stonith->cmds->register_callback = stonith_api_add_callback; new_stonith->cmds->remove_notification = stonith_api_del_notification; new_stonith->cmds->register_notification = stonith_api_add_notification; /* *INDENT-ON* */ return new_stonith; } stonith_key_value_t * stonith_key_value_add(stonith_key_value_t * head, const char *key, const char *value) { stonith_key_value_t *p, *end; p = calloc(1, sizeof(stonith_key_value_t)); if (key) { p->key = strdup(key); } if (value) { p->value = strdup(value); } end = head; while (end && end->next) { end = end->next; } if (end) { end->next = p; } else { head = p; } return head; } void stonith_key_value_freeall(stonith_key_value_t * head, int keys, int values) { stonith_key_value_t *p; while (head) { p = head->next; if (keys) { free(head->key); } if (values) { free(head->value); } free(head); head = p; } } int stonith_api_kick(int nodeid, const char *uname, int timeout, bool off) { char *name = NULL; const char *action = "reboot"; int rc = -EPROTO; stonith_t *st = NULL; enum stonith_call_options opts = st_opt_sync_call | st_opt_allow_suicide; st = stonith_api_new(); if (st) { rc = st->cmds->connect(st, "stonith-api", NULL); } if (uname != NULL) { name = strdup(uname); } else if (nodeid > 0) { opts |= st_opt_cs_nodeid; name = crm_itoa(nodeid); } if (off) { action = "off"; } if (rc == pcmk_ok) { rc = st->cmds->fence(st, opts, name, action, timeout, 0); } if (st) { st->cmds->disconnect(st); stonith_api_delete(st); } free(name); return rc; } time_t stonith_api_time(int nodeid, const char *uname, bool in_progress) { int rc = 0; char *name = NULL; time_t when = 0; time_t progress = 0; stonith_t *st = NULL; stonith_history_t *history, *hp = NULL; enum stonith_call_options opts = st_opt_sync_call; st = stonith_api_new(); if (st) { rc = st->cmds->connect(st, "stonith-api", NULL); } if (uname != NULL) { name = strdup(uname); } else if (nodeid > 0) { opts |= st_opt_cs_nodeid; name = crm_itoa(nodeid); } if (st && rc == pcmk_ok) { st->cmds->history(st, st_opt_sync_call | st_opt_cs_nodeid, name, &history, 120); for (hp = history; hp; hp = hp->next) { if (in_progress) { if (hp->state != st_done && hp->state != st_failed) { progress = time(NULL); } } else if (hp->state == st_done) { when = hp->completed; } } } if (progress) { when = progress; } if (st) { st->cmds->disconnect(st); stonith_api_delete(st); } free(name); return when; } + +#if HAVE_STONITH_STONITH_H +#include + +const char *i_hate_pils(int rc); + +const char * +i_hate_pils(int rc) +{ + return PIL_strerror(rc); +} +#endif diff --git a/mcp/pacemaker.c b/mcp/pacemaker.c index cfa762e323..6cc64c5282 100644 --- a/mcp/pacemaker.c +++ b/mcp/pacemaker.c @@ -1,858 +1,876 @@ /* * Copyright (C) 2010 Andrew Beekhof * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public * License as published by the Free Software Foundation; either * version 2 of the License, or (at your option) any 
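The two standalone helpers above, stonith_api_kick() and stonith_api_time(), bundle the whole connect/fence/disconnect cycle for callers that want one-shot behaviour. For example (a sketch; the node name and timeout are made up):

    /* Reboot node1, waiting up to 120s; pass off=TRUE for power-off instead */
    int rc = stonith_api_kick(0, "node1", 120, FALSE);

    /* When was node1 last successfully fenced? 0 means never/unknown */
    time_t when = stonith_api_time(0, "node1", FALSE);
    crm_info("Fencing node1: %s; last fenced at %lu",
             pcmk_strerror(rc), (unsigned long) when);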
later version. * * This software is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * General Public License for more details. * * You should have received a copy of the GNU General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ #include #include #include #include #include #include #include #include #include #include #include gboolean fatal_error = FALSE; GMainLoop *mainloop = NULL; GHashTable *client_list = NULL; GHashTable *peers = NULL; char *local_name = NULL; uint32_t local_nodeid = 0; crm_trigger_t *shutdown_trigger = NULL; const char *pid_file = "/var/run/pacemaker.pid"; /* *INDENT-OFF* */ enum crm_proc_flag { crm_proc_none = 0x00000001, crm_proc_plugin = 0x00000002, crm_proc_lrmd = 0x00000010, crm_proc_cib = 0x00000100, crm_proc_crmd = 0x00000200, crm_proc_attrd = 0x00001000, crm_proc_stonithd = 0x00002000, crm_proc_pe = 0x00010000, crm_proc_te = 0x00020000, crm_proc_mgmtd = 0x00040000, crm_proc_stonith_ng = 0x00100000, }; /* *INDENT-ON* */ typedef struct pcmk_child_s { int pid; long flag; int start_seq; int respawn_count; gboolean respawn; const char *name; const char *uid; const char *command; } pcmk_child_t; /* Index into the array below */ #define pcmk_child_crmd 4 #define pcmk_child_mgmtd 8 /* *INDENT-OFF* */ static pcmk_child_t pcmk_children[] = { { 0, crm_proc_none, 0, 0, FALSE, "none", NULL, NULL }, { 0, crm_proc_plugin, 0, 0, FALSE, "ais", NULL, NULL }, { 0, crm_proc_lrmd, 3, 0, TRUE, "lrmd", NULL, CRM_DAEMON_DIR"/lrmd" }, { 0, crm_proc_cib, 1, 0, TRUE, "cib", CRM_DAEMON_USER, CRM_DAEMON_DIR"/cib" }, { 0, crm_proc_crmd, 6, 0, TRUE, "crmd", CRM_DAEMON_USER, CRM_DAEMON_DIR"/crmd" }, { 0, crm_proc_attrd, 4, 0, TRUE, "attrd", CRM_DAEMON_USER, CRM_DAEMON_DIR"/attrd" }, { 0, crm_proc_stonithd, 0, 0, TRUE, "stonithd", NULL, NULL }, { 0, crm_proc_pe, 5, 0, TRUE, "pengine", CRM_DAEMON_USER, CRM_DAEMON_DIR"/pengine" }, { 0, crm_proc_mgmtd, 0, 0, TRUE, "mgmtd", NULL, HB_DAEMON_DIR"/mgmtd" }, { 0, crm_proc_stonith_ng, 2, 0, TRUE, "stonith-ng", NULL, CRM_DAEMON_DIR"/stonithd" }, }; /* *INDENT-ON* */ static gboolean start_child(pcmk_child_t * child); void enable_crmd_as_root(gboolean enable) { if (enable) { pcmk_children[pcmk_child_crmd].uid = NULL; } else { pcmk_children[pcmk_child_crmd].uid = CRM_DAEMON_USER; } } void enable_mgmtd(gboolean enable) { if (enable) { pcmk_children[pcmk_child_mgmtd].start_seq = 7; } else { pcmk_children[pcmk_child_mgmtd].start_seq = 0; } } static uint32_t get_process_list(void) { int lpc = 0; uint32_t procs = crm_proc_plugin; for (lpc = 0; lpc < SIZEOF(pcmk_children); lpc++) { if (pcmk_children[lpc].pid != 0) { procs |= pcmk_children[lpc].flag; } } return procs; } static void pcmk_child_exit(GPid pid, gint status, gpointer user_data) { int exitcode = 0; pcmk_child_t *child = user_data; if(WIFSIGNALED(status)) { int signo = WTERMSIG(status); int core = WCOREDUMP(status); crm_notice("Child process %s terminated with signal %d (pid=%d, core=%d)", child->name, signo, child->pid, core); } else if(WIFEXITED(status)) { exitcode = WEXITSTATUS(status); do_crm_log(exitcode == 0 ? LOG_INFO : LOG_ERR, "Child process %s exited (pid=%d, rc=%d)", child->name, child->pid, exitcode); } child->pid = 0; if (exitcode == 100) { crm_warn("Pacemaker child process %s no longer wishes to be respawned. 
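pcmk_child_exit() below enforces a respawn contract: exit code 100 from a child means "do not respawn me" and additionally flags a fatal error for the whole MCP. A child honours that contract like so (a sketch; the failure condition is hypothetical):

    /* Inside a managed daemon: bail out permanently rather than be respawned */
    if (unrecoverable_config_error) {
        crm_err("Configuration cannot be fixed by a restart, exiting");
        exit(100);    /* magic value checked by pcmk_child_exit() */
    }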
" "Shutting ourselves down.", child->name); child->respawn = FALSE; fatal_error = TRUE; pcmk_shutdown(15); } /* Broadcast the fact that one of our processes died ASAP * * Try to get some logging of the cause out first though * because we're probably about to get fenced * * Potentially do this only if respawn_count > N * to allow for local recovery */ update_node_processes(local_nodeid, NULL, get_process_list()); child->respawn_count += 1; if (child->respawn_count > MAX_RESPAWN) { crm_err("Child respawn count exceeded by %s", child->name); child->respawn = FALSE; } if (shutdown_trigger) { mainloop_set_trigger(shutdown_trigger); update_node_processes(local_nodeid, NULL, get_process_list()); } else if (child->respawn) { crm_notice("Respawning failed child process: %s", child->name); start_child(child); } } static gboolean stop_child(pcmk_child_t * child, int signal) { if (signal == 0) { signal = SIGTERM; } if (child->command == NULL) { crm_debug("Nothing to do for child \"%s\"", child->name); return TRUE; } if (child->pid <= 0) { crm_trace("Client %s not running", child->name); return TRUE; } errno = 0; if (kill(child->pid, signal) == 0) { crm_notice("Stopping %s: Sent -%d to process %d", child->name, signal, child->pid); } else { crm_perror(LOG_ERR, "Stopping %s: Could not send -%d to process %d failed", child->name, signal, child->pid); } return TRUE; } static char *opts_default[] = { NULL, NULL }; static char *opts_vgrind[] = { NULL, NULL, NULL, NULL, NULL }; static gboolean start_child(pcmk_child_t * child) { int lpc = 0; uid_t uid = 0; struct rlimit oflimits; gboolean use_valgrind = FALSE; gboolean use_callgrind = FALSE; const char *devnull = "/dev/null"; const char *env_valgrind = getenv("PCMK_valgrind_enabled"); const char *env_callgrind = getenv("PCMK_callgrind_enabled"); if (child->command == NULL) { crm_info("Nothing to do for child \"%s\"", child->name); return TRUE; } if (env_callgrind != NULL && crm_is_true(env_callgrind)) { use_callgrind = TRUE; use_valgrind = TRUE; } else if (env_callgrind != NULL && strstr(env_callgrind, child->name)) { use_callgrind = TRUE; use_valgrind = TRUE; } else if (env_valgrind != NULL && crm_is_true(env_valgrind)) { use_valgrind = TRUE; } else if (env_valgrind != NULL && strstr(env_valgrind, child->name)) { use_valgrind = TRUE; } if (use_valgrind && strlen(VALGRIND_BIN) == 0) { crm_warn("Cannot enable valgrind for %s:" " The location of the valgrind binary is unknown", child->name); use_valgrind = FALSE; } child->pid = fork(); CRM_ASSERT(child->pid != -1); if (child->pid > 0) { /* parent */ g_child_watch_add(child->pid, pcmk_child_exit, child); crm_info("Forked child %d for process %s%s", child->pid, child->name, use_valgrind ? 
" (valgrind enabled: " VALGRIND_BIN ")" : ""); update_node_processes(local_nodeid, NULL, get_process_list()); return TRUE; } else { /* Start a new session */ (void)setsid(); /* Setup the two alternate arg arrarys */ opts_vgrind[0] = strdup(VALGRIND_BIN); if (use_callgrind) { opts_vgrind[1] = strdup("--tool=callgrind"); opts_vgrind[2] = strdup("--callgrind-out-file=" CRM_STATE_DIR "/callgrind.out.%p"); opts_vgrind[3] = strdup(child->command); opts_vgrind[4] = NULL; } else { opts_vgrind[1] = strdup(child->command); opts_vgrind[2] = NULL; opts_vgrind[3] = NULL; opts_vgrind[4] = NULL; } opts_default[0] = strdup(child->command);; #if 0 /* Dont set the group for now - it prevents connection to the cluster */ if (gid && setgid(gid) < 0) { crm_perror("Could not set group to %d", gid); } #endif if (child->uid) { if (crm_user_lookup(child->uid, &uid, NULL) < 0) { crm_err("Invalid uid (%s) specified for %s", child->uid, child->name); return TRUE; } } if (uid && setuid(uid) < 0) { crm_perror(LOG_ERR, "Could not set user to %d (%s)", uid, child->uid); } /* Close all open file descriptors */ getrlimit(RLIMIT_NOFILE, &oflimits); for (lpc = 0; lpc < oflimits.rlim_cur; lpc++) { close(lpc); } (void)open(devnull, O_RDONLY); /* Stdin: fd 0 */ (void)open(devnull, O_WRONLY); /* Stdout: fd 1 */ (void)open(devnull, O_WRONLY); /* Stderr: fd 2 */ if (use_valgrind) { (void)execvp(VALGRIND_BIN, opts_vgrind); } else { (void)execvp(child->command, opts_default); } crm_perror(LOG_ERR, "FATAL: Cannot exec %s", child->command); exit(100); } return TRUE; /* never reached */ } static gboolean escalate_shutdown(gpointer data) { pcmk_child_t *child = data; if (child->pid) { /* Use SIGSEGV instead of SIGKILL to create a core so we can see what it was up to */ crm_err("Child %s not terminating in a timely manner, forcing", child->name); stop_child(child, SIGSEGV); } return FALSE; } static gboolean pcmk_shutdown_worker(gpointer user_data) { static int phase = 0; static time_t next_log = 0; static int max = SIZEOF(pcmk_children); int lpc = 0; if (phase == 0) { crm_notice("Shuting down Pacemaker"); phase = max; } for (; phase > 0; phase--) { /* dont stop anything with start_seq < 1 */ for (lpc = max - 1; lpc >= 0; lpc--) { pcmk_child_t *child = &(pcmk_children[lpc]); if (phase != child->start_seq) { continue; } if (child->pid) { time_t now = time(NULL); if (child->respawn) { next_log = now + 30; child->respawn = FALSE; stop_child(child, SIGTERM); if (phase < pcmk_children[pcmk_child_crmd].start_seq) { g_timeout_add(180000 /* 3m */ , escalate_shutdown, child); } } else if (now >= next_log) { next_log = now + 30; crm_notice("Still waiting for %s (pid=%d, seq=%d) to terminate...", child->name, child->pid, child->start_seq); } return TRUE; } /* cleanup */ crm_debug("%s confirmed stopped", child->name); child->pid = 0; } } /* send_cluster_id(); */ crm_notice("Shutdown complete"); g_main_loop_quit(mainloop); if(fatal_error) { crm_notice("Attempting to inhibit respawning after fatal error"); exit(100); } return TRUE; } void pcmk_shutdown(int nsig) { if (shutdown_trigger == NULL) { shutdown_trigger = mainloop_add_trigger(G_PRIORITY_HIGH, pcmk_shutdown_worker, NULL); } mainloop_set_trigger(shutdown_trigger); } static void build_path(const char *path_c, mode_t mode) { int offset = 1, len = 0; char *path = strdup(path_c); CRM_CHECK(path != NULL, return); for (len = strlen(path); offset < len; offset++) { if (path[offset] == '/') { path[offset] = 0; if (mkdir(path, mode) < 0 && errno != EEXIST) { crm_perror(LOG_ERR, "Could not create 
directory '%s'", path); break; } path[offset] = '/'; } } if (mkdir(path, mode) < 0 && errno != EEXIST) { crm_perror(LOG_ERR, "Could not create directory '%s'", path); } free(path); } static int32_t pcmk_ipc_accept(qb_ipcs_connection_t *c, uid_t uid, gid_t gid) { crm_trace("Connecting %p for uid=%d gid=%d", c, uid, gid); return 0; } static void pcmk_ipc_created(qb_ipcs_connection_t *c) { g_hash_table_insert(client_list, c, c); crm_debug("Channel %p connected: %d children", c, g_hash_table_size(client_list)); /* update_process_clients(); */ } /* Exit code means? */ static int32_t pcmk_ipc_dispatch(qb_ipcs_connection_t *c, void *data, size_t size) { uint32_t id = 0; uint32_t flags = 0; const char *task = NULL; xmlNode *msg = crm_ipcs_recv(c, data, size, &id, &flags); crm_trace("Message from %p", c); if(flags & crm_ipc_client_response) { crm_ipcs_send_ack(c, id, "ack", __FUNCTION__, __LINE__); } if (msg == NULL) { return 0; } task = crm_element_value(msg, F_CRM_TASK); if(crm_str_eq(task, CRM_OP_QUIT, TRUE)) { /* Time to quit */ crm_notice("Shutting down in responce to ticket %s (%s)", crm_element_value(msg, XML_ATTR_REFERENCE), crm_element_value(msg, F_CRM_ORIGIN)); pcmk_shutdown(15); } else { /* Just send to everyone */ update_process_clients(); } free_xml(msg); return 0; } /* Error code means? */ static int32_t pcmk_ipc_closed(qb_ipcs_connection_t *c) { crm_trace("%p closed", c); return 0; } static void pcmk_ipc_destroy(qb_ipcs_connection_t *c) { crm_trace("%p destroy", c); g_hash_table_remove(client_list, c); } struct qb_ipcs_service_handlers ipc_callbacks = { .connection_accept = pcmk_ipc_accept, .connection_created = pcmk_ipc_created, .msg_process = pcmk_ipc_dispatch, .connection_closed = pcmk_ipc_closed, .connection_destroyed = pcmk_ipc_destroy }; static gboolean ghash_send_proc_details(gpointer key, gpointer value, gpointer data) { if (crm_ipcs_send(key, 0, data, TRUE) <= 0) { /* remove it */ return TRUE; } return FALSE; } static void peer_loop_fn(gpointer key, gpointer value, gpointer user_data) { pcmk_peer_t *node = value; xmlNode *update = user_data; xmlNode *xml = create_xml_node(update, "node"); crm_xml_add_int(xml, "id", node->id); crm_xml_add(xml, "uname", node->uname); crm_xml_add_int(xml, "processes", node->processes); } void update_process_clients(void) { xmlNode *update = create_xml_node(NULL, "nodes"); crm_trace("Sending process list to %d children", g_hash_table_size(client_list)); g_hash_table_foreach(peers, peer_loop_fn, update); g_hash_table_foreach_remove(client_list, ghash_send_proc_details, update); free_xml(update); } void update_process_peers(void) { char buffer[1024]; struct iovec iov; int rc = 0; memset(buffer, 0, SIZEOF(buffer)); if (local_name) { rc = snprintf(buffer, SIZEOF(buffer) - 1, "", local_name, get_process_list()); } else { rc = snprintf(buffer, SIZEOF(buffer) - 1, "", get_process_list()); } iov.iov_base = buffer; iov.iov_len = rc + 1; crm_trace("Sending %s", buffer); send_cpg_message(&iov); } gboolean update_node_processes(uint32_t id, const char *uname, uint32_t procs) { gboolean changed = FALSE; pcmk_peer_t *node = g_hash_table_lookup(peers, GUINT_TO_POINTER(id)); if (node == NULL) { changed = TRUE; node = calloc(1, sizeof(pcmk_peer_t)); node->id = id; g_hash_table_insert(peers, GUINT_TO_POINTER(id), node); node = g_hash_table_lookup(peers, GUINT_TO_POINTER(id)); CRM_ASSERT(node != NULL); } if (uname != NULL) { if (node->uname == NULL || safe_str_eq(node->uname, uname) == FALSE) { int lpc, len = strlen(uname); crm_notice("%p Node %u now known as 
%s%s%s", node, id, uname, node->uname?node->uname:", was: ", node->uname?node->uname:""); free(node->uname); node->uname = strdup(uname); changed = TRUE; for(lpc = 0; lpc < len; lpc++) { if(uname[lpc] >= 'A' && uname[lpc] <= 'Z') { crm_warn("Node names with capitals are discouraged, consider changing '%s' to something else", uname); break; } } } } else { crm_trace("Empty uname for node %u", id); } if (procs != 0) { if(procs != node->processes) { crm_debug("Node %s now has process list: %.32x (was %.32x)", node->uname, procs, node->processes); node->processes = procs; changed = TRUE; } else { crm_trace("Node %s still has process list: %.32x", node->uname, procs); } } if (changed && id == local_nodeid) { update_process_clients(); update_process_peers(); } return changed; } /* *INDENT-OFF* */ static struct crm_option long_options[] = { /* Top-level Options */ {"help", 0, 0, '?', "\tThis text"}, {"version", 0, 0, '$', "\tVersion information" }, {"verbose", 0, 0, 'V', "\tIncrease debug output"}, {"shutdown", 0, 0, 'S', "\tInstruct Pacemaker to shutdown on this machine"}, {"features", 0, 0, 'F', "\tDisplay the full version and list of features Pacemaker was built with"}, {"-spacer-", 1, 0, '-', "\nAdditional Options:"}, {"foreground", 0, 0, 'f', "\tRun in the foreground instead of as a daemon"}, {"pid-file", 1, 0, 'p', "\t(Advanced) Daemon pid file location"}, {NULL, 0, 0, 0} }; /* *INDENT-ON* */ +static void +mcp_chown(const char *path, uid_t uid, gid_t gid) +{ + int rc = chown(path, uid, gid); + if(rc < 0) { + crm_warn("Cannot change the ownership of %s to user %s and gid %d: %s", + path, CRM_DAEMON_USER, gid, pcmk_strerror(errno)); + } +} + int main(int argc, char **argv) { int rc; int flag; int argerr = 0; int option_index = 0; gboolean shutdown = FALSE; int start_seq = 1, lpc = 0; static int max = SIZEOF(pcmk_children); uid_t pcmk_uid = 0; gid_t pcmk_gid = 0; struct rlimit cores; crm_ipc_t *old_instance = NULL; qb_ipcs_service_t *ipcs = NULL; const char *facility = daemon_option("logfacility"); set_daemon_option("mcp", "true"); set_daemon_option("use_logd", "off"); crm_log_init(NULL, LOG_INFO, TRUE, FALSE, argc, argv, FALSE); crm_set_options(NULL, "mode [options]", long_options, "Start/Stop Pacemaker\n"); /* Restore the original facility so that read_config() does the right thing */ set_daemon_option("logfacility", facility); while (1) { flag = crm_get_option(argc, argv, &option_index); if (flag == -1) break; switch (flag) { case 'V': crm_bump_log_level(argc, argv); break; case 'f': /* Legacy */ break; case 'p': pid_file = optarg; break; case '$': case '?': crm_help(flag, EX_OK); break; case 'S': shutdown = TRUE; break; case 'F': printf("Pacemaker %s (Build: %s)\n Supporting: %s\n", VERSION, BUILD_VERSION, CRM_FEATURES); exit(0); default: printf("Argument code 0%o (%c) is not (?yet?) 
supported\n", flag, flag); ++argerr; break; } } if (optind < argc) { printf("non-option ARGV-elements: "); while (optind < argc) printf("%s ", argv[optind++]); printf("\n"); } if (argerr) { crm_help('?', EX_USAGE); } crm_debug("Checking for old instances of %s", CRM_SYSTEM_MCP); old_instance = crm_ipc_new(CRM_SYSTEM_MCP, 0); crm_ipc_connect(old_instance); if(shutdown) { crm_debug("Terminating previous instance"); while (crm_ipc_connected(old_instance)) { xmlNode *cmd = create_request(CRM_OP_QUIT, NULL, NULL, CRM_SYSTEM_MCP, CRM_SYSTEM_MCP, NULL); crm_debug("."); crm_ipc_send(old_instance, cmd, 0, 0, NULL); free_xml(cmd); sleep(2); } crm_ipc_close(old_instance); crm_ipc_destroy(old_instance); exit(0); } else if(crm_ipc_connected(old_instance)) { crm_ipc_close(old_instance); crm_ipc_destroy(old_instance); crm_err("Pacemaker is already active, aborting startup"); exit(100); } crm_ipc_close(old_instance); crm_ipc_destroy(old_instance); if (read_config() == FALSE) { crm_notice("Could not obtain corosync config data, exiting"); return 1; } crm_notice("Starting Pacemaker %s (Build: %s): %s", VERSION, BUILD_VERSION, CRM_FEATURES); mainloop = g_main_new(FALSE); rc = getrlimit(RLIMIT_CORE, &cores); if (rc < 0) { crm_perror(LOG_ERR, "Cannot determine current maximum core size."); } else { if (cores.rlim_max == 0 && geteuid() == 0) { cores.rlim_max = RLIM_INFINITY; } else { crm_info("Maximum core file size is: %lu", (unsigned long)cores.rlim_max); } cores.rlim_cur = cores.rlim_max; rc = setrlimit(RLIMIT_CORE, &cores); if (rc < 0) { crm_perror(LOG_ERR, "Core file generation will remain disabled." " Core files are an important diagnositic tool," " please consider enabling them by default."); } #if 0 /* system() is not thread-safe, can't call from here * Actually, its a pretty hacky way to try and achieve this anyway */ if (system("echo 1 > /proc/sys/kernel/core_uses_pid") != 0) { crm_perror(LOG_ERR, "Could not enable /proc/sys/kernel/core_uses_pid"); } #endif } if (crm_user_lookup(CRM_DAEMON_USER, &pcmk_uid, &pcmk_gid) < 0) { crm_err("Cluster user %s does not exist, aborting Pacemaker startup", CRM_DAEMON_USER); return TRUE; } mkdir(CRM_STATE_DIR, 0750); - rc = chown(CRM_STATE_DIR, pcmk_uid, pcmk_gid); - if(rc < 0) { - crm_warn("Cannot change the ownership of %s to user %s and gid %d", - CRM_STATE_DIR, CRM_DAEMON_USER, pcmk_gid); - } + mcp_chown(CRM_STATE_DIR, pcmk_uid, pcmk_gid); /* Used by stonithd */ build_path(HA_STATE_DIR "/heartbeat", 0755); + mcp_chown(HA_STATE_DIR, pcmk_uid, pcmk_gid); /* Used by RAs - Leave owned by root */ build_path(CRM_RSCTMP_DIR, 0755); /* Used to store core files in */ build_path(CRM_CORE_DIR, 0755); + mcp_chown(CRM_CORE_DIR, pcmk_uid, pcmk_gid); + + /* Used to store blackbox dumps in */ + build_path(CRM_BLACKBOX_DIR, 0755); + mcp_chown(CRM_BLACKBOX_DIR, pcmk_uid, pcmk_gid); + + /* Used to store policy engine inputs in */ + build_path(PE_STATE_DIR, 0755); + mcp_chown(PE_STATE_DIR, pcmk_uid, pcmk_gid); + + /* Used to store the cluster configuration */ + build_path(CRM_CONFIG_DIR, 0755); + mcp_chown(CRM_CONFIG_DIR, pcmk_uid, pcmk_gid); /* Per-user core directories */ if (mkdir(CRM_CORE_DIR"/root", 0700) < 0 && errno != EEXIST) { crm_perror(LOG_INFO, "Could not create %s", CRM_CORE_DIR"/root"); } if (mkdir(CRM_CORE_DIR"/"CRM_DAEMON_USER, 0700) < 0 && errno != EEXIST) { crm_perror(LOG_INFO, "Could not create %s", CRM_CORE_DIR"/"CRM_DAEMON_USER); } else { - if(chown(CRM_CORE_DIR"/"CRM_DAEMON_USER, pcmk_uid, pcmk_gid) < 0) { - crm_perror(LOG_ERR, "Could not change the ownership 
of %s to %s", CRM_CORE_DIR"/"CRM_DAEMON_USER, CRM_DAEMON_USER); - } + mcp_chown(CRM_CORE_DIR, pcmk_uid, pcmk_gid); } client_list = g_hash_table_new(g_direct_hash, g_direct_equal); peers = g_hash_table_new(g_direct_hash, g_direct_equal); ipcs = mainloop_add_ipc_server(CRM_SYSTEM_MCP, QB_IPC_NATIVE, &ipc_callbacks); if (ipcs == NULL) { crm_err("Couldn't start IPC server"); return 1; } if (cluster_connect_cfg(&local_nodeid) == FALSE) { crm_err("Couldn't connect to Corosync's CFG service"); return 1; } if (cluster_connect_cpg() == FALSE) { crm_err("Couldn't connect to Corosync's CPG service"); return 1; } local_name = get_local_node_name(); update_node_processes(local_nodeid, local_name, get_process_list()); mainloop_add_signal(SIGTERM, pcmk_shutdown); mainloop_add_signal(SIGINT, pcmk_shutdown); for (start_seq = 1; start_seq < max; start_seq++) { /* dont start anything with start_seq < 1 */ for (lpc = 0; lpc < max; lpc++) { if (start_seq == pcmk_children[lpc].start_seq) { start_child(&(pcmk_children[lpc])); } } } crm_info("Starting mainloop"); g_main_run(mainloop); if(ipcs) { crm_trace("Closing IPC server"); mainloop_del_ipc_server(ipcs); ipcs = NULL; } g_main_destroy(mainloop); cluster_disconnect_cpg(); cluster_disconnect_cfg(); crm_info("Exiting %s", crm_system_name); return 0; } diff --git a/pacemaker.spec.in b/pacemaker.spec.in index 36514d81dd..8e1c73e696 100644 --- a/pacemaker.spec.in +++ b/pacemaker.spec.in @@ -1,822 +1,822 @@ %global gname haclient %global uname hacluster %global pcmk_docdir %{_docdir}/%{name} %global specversion 1 %global upstream_version HEAD %global upstream_prefix ClusterLabs-pacemaker # Compatibility macros for distros (fedora) that don't provide Python macros by default # Do this instead of trying to conditionally include {_rpmconfigdir}/macros.python %{!?py_ver: %{expand: %%global py_ver %%(echo `python -c "import sys; print sys.version[:3]"`)}} %{!?py_prefix: %{expand: %%global py_prefix %%(echo `python -c "import sys; print sys.prefix"`)}} %{!?py_libdir: %{expand: %%global py_libdir %%{expand:%%%%{py_prefix}/%%%%{_lib}/python%%%%{py_ver}}}} %{!?py_sitedir: %{expand: %%global py_sitedir %%{expand:%%%%{py_libdir}/site-packages}}} # Turn off the auto compilation of python files not in the site-packages directory # Needed so that the -devel package is multilib compliant %global __os_install_post %(echo '%{__os_install_post}' | sed -e 's!/usr/lib[^[:space:]]*/brp-python-bytecompile[[:space:]].*$!!g') # Compatibility macro wrappers for legacy RPM versions that do not # support conditional builds %{!?bcond_without: %{expand: %%global bcond_without() %%{expand:%%%%{!?_without_%%{1}:%%%%global with_%%{1} 1}}}} %{!?bcond_with: %{expand: %%global bcond_with() %%{expand:%%%%{?_with_%%{1}:%%%%global with_%%{1} 1}}}} %{!?with: %{expand: %%global with() %%{expand:%%%%{?with_%%{1}:1}%%%%{!?with_%%{1}:0}}}} %{!?without: %{expand: %%global without() %%{expand:%%%%{?with_%%{1}:0}%%%%{!?with_%%{1}:1}}}} %global cs_major %(pkg-config corosync --modversion | awk -F . '{print $1}') %global cs_minor %(pkg-config corosync --modversion | awk -F . '{print $2}') %global rawhide %(test ! -e /etc/yum.repos.d/fedora-rawhide.repo; echo $?) # Conditionals # Invoke "rpmbuild --without " or "rpmbuild --with " # to disable or enable specific features # Supported cluster stacks, must support at least one %bcond_without corosync %bcond_with heartbeat %bcond_with cman # Legacy stonithd fencing agents %bcond_with stonithd # ESMTP is not available in RHEL, only in EPEL. 
Allow people to build # the RPM without ESMTP in case they choose not to use EPEL packages %bcond_with esmtp %bcond_with snmp # Build with/without support for profiling tools %bcond_with profiling %bcond_with gcov # We generate docs using Publican, Asciidoc and Inkscape, but they're not available everywhere %bcond_without doc # Use a different versioning scheme %bcond_with pre_release %if %{with profiling} # This disables -debuginfo package creation and also the stripping binaries/libraries # Useful if you want sane profiling data %global debug_package %{nil} %endif %if %{with pre_release} %global pcmk_release 0.%{specversion}.%{upstream_version}.git %else %global pcmk_release %{specversion} %endif Name: pacemaker Summary: Scalable High-Availability cluster resource manager Version: 1.1.7 Release: %{pcmk_release}%{?dist} License: GPLv2+ and LGPLv2+ Url: http://www.clusterlabs.org Group: System Environment/Daemons # export VER={upstream_version} # wget --no-check-certificate -O ClusterLabs-pacemaker-${VER}.tar.gz https://github.com/ClusterLabs/pacemaker/tarball/${VER} Source0: %{upstream_prefix}-%{upstream_version}.tar.gz BuildRoot: %(mktemp -ud %{_tmppath}/%{name}-%{version}-%{release}-XXXXXX) AutoReqProv: on Requires: resource-agents Requires: %{name}-libs = %{version}-%{release} Requires: %{name}-cluster-libs = %{version}-%{release} Requires: %{name}-cli = %{version}-%{release} Requires: python >= 2.4 %if 0%{?rhel} > 0 ExclusiveArch: i686 x86_64 %endif %if %{defined _unitdir} # Needed for systemd unit Requires(post): systemd-sysv Requires(post): systemd-units Requires(preun): systemd-units Requires(postun): systemd-units %endif %if %{with heartbeat} Requires(pre): cluster-glue %endif %if %{with snmp} Requires: perl(:MODULE_COMPAT_%(eval "`%{__perl} -V:version`"; echo $version)) %endif %if 0%{?suse_version} # net-snmp-devel on SLES10 does not suck in tcpd-devel automatically BuildRequires: tcpd-devel # Suse splits this off into a separate package Requires: python-curses python-xml BuildRequires: python-curses python-xml %endif # Required for core functionality BuildRequires: automake autoconf libtool pkgconfig python libtool-ltdl-devel BuildRequires: glib2-devel libxml2-devel libxslt-devel libuuid-devel BuildRequires: pkgconfig python-devel gcc-c++ bzip2-devel pam-devel %if 0%{?suse_version} >= 1100 # Renamed since opensuse-11.0 BuildRequires: libgnutls-devel %else BuildRequires: gnutls-devel %endif # Enables optional functionality BuildRequires: ncurses-devel openssl-devel libselinux-devel docbook-style-xsl libqb-devel BuildRequires: bison byacc flex help2man %if %{with cman} %if 0%{?fedora} > 0 %if 0%{?fedora} < 17 BuildRequires: clusterlib-devel %endif %endif %if 0%{?rhel} > 0 %if 0%{?rhel} < 7 BuildRequires: clusterlib-devel %endif %endif %endif %if %{with esmtp} BuildRequires: libesmtp-devel %endif %if %{with snmp} %ifarch alpha %{ix86} x86_64 BuildRequires: lm_sensors-devel %endif BuildRequires: net-snmp-devel %endif %if %{with corosync} Requires: corosync BuildRequires: corosynclib-devel %endif %if %{with heartbeat} # Do not require heartbeat, the admin should select which stack to use and install it BuildRequires: cluster-glue-libs-devel heartbeat-devel heartbeat-libs >= 3.0.0 %endif %if %{with stonithd} BuildRequires: cluster-glue-libs-devel %endif %if !%{rawhide} # More often than not, inkscape is busted on rawhide, don't even bother %if %{with doc} %ifarch %{ix86} x86_64 BuildRequires: publican inkscape asciidoc %endif %endif %endif %description Pacemaker is an advanced, 
scalable High-Availability cluster resource manager for Linux-HA (Heartbeat) and/or Corosync. It supports "n-node" clusters with significant capabilities for managing resources and dependencies. It will run scripts at initialization, when machines go up or down, when related resources fail and can be configured to periodically check resource health. Available rpmbuild rebuild options: --with(out) : heartbeat cman corosync doc publican snmp esmtp pre_release %package cli License: GPLv2+ and LGPLv2+ Summary: Command line tools for controlling Pacemaker clusters Group: System Environment/Daemons Requires: %{name}-libs = %{version}-%{release} %description cli Pacemaker is an advanced, scalable High-Availability cluster resource manager for Linux-HA (Heartbeat) and/or Corosync. The %{name}-cli package contains command line tools that can be used to query and control the cluster from machines that may, or may not, be part of the cluster. %package -n %{name}-libs License: GPLv2+ and LGPLv2+ Summary: Core Pacemaker libraries Group: System Environment/Daemons %description -n %{name}-libs Pacemaker is an advanced, scalable High-Availability cluster resource manager for Linux-HA (Heartbeat) and/or Corosync. The %{name}-libs package contains shared libraries needed for cluster nodes and those just running the CLI tools. %package -n %{name}-cluster-libs License: GPLv2+ and LGPLv2+ Summary: Cluster Libraries used by Pacemaker Group: System Environment/Daemons Requires: %{name}-libs = %{version}-%{release} %description -n %{name}-cluster-libs Pacemaker is an advanced, scalable High-Availability cluster resource manager for Linux-HA (Heartbeat) and/or Corosync. The %{name}-cluster-libs package contains cluster-aware shared libraries needed for nodes that will form part of the cluster nodes. %package -n %{name}-libs-devel License: GPLv2+ and LGPLv2+ Summary: Pacemaker development package Group: Development/Libraries Requires: %{name}-cts = %{version}-%{release} Requires: %{name}-libs = %{version}-%{release} Requires: %{name}-cluster-libs = %{version}-%{release} -Requires: libtool-ltdl-devel libqb-devel libuuid-devel libuuid-devel +Requires: libtool-ltdl-devel libqb-devel libuuid-devel Requires: libxml2-devel libxslt-devel bzip2-devel glib2-devel %if %{with corosync} Requires: corosynclib-devel %endif %if %{with heartbeat} Requires: cluster-glue-libs-devel heartbeat-devel %endif %description -n %{name}-libs-devel Pacemaker is an advanced, scalable High-Availability cluster resource manager for Linux-HA (Heartbeat) and/or Corosync. The %{name}-libs-devel package contains headers and shared libraries for developing tools for Pacemaker. %package cts License: GPLv2+ and LGPLv2+ Summary: Test framework for cluster-related technologies like Pacemaker Group: System Environment/Daemons Requires: python %description cts Test framework for cluster-related technologies like Pacemaker %package doc License: GPLv2+ and LGPLv2+ Summary: Documentation for Pacemaker Group: Documentation %description doc Documentation for Pacemaker. Pacemaker is an advanced, scalable High-Availability cluster resource manager for Linux-HA (Heartbeat) and/or Corosync. %prep %setup -q -n %{upstream_prefix}-%{upstream_version} # Force the local time # # 'hg archive' sets the file date to the date of the last commit. # This can result in files having been created in the future # when building on machines in timezones 'behind' the one the # commit occurred in - which seriously confuses 'make' find . 
-exec touch \{\} \; %build ./autogen.sh %if %{with snmp} eval `objdump --headers --private-headers /usr/bin/perl | grep RPATH | awk '{print "export LD_LIBRARY_PATH="$2}'` %endif # RHEL <= 5 does not support --docdir docdir=%{pcmk_docdir} %{configure} \ %{!?with_heartbeat: --without-heartbeat} \ %{!?with_corosync: --without-ais} \ %{!?with_esmtp: --without-esmtp} \ %{!?with_snmp: --without-snmp} \ %{?with_cman: --with-cman} \ %{?with_profiling: --with-profiling} \ %{?with_gcov: --with-gcov} \ --with-initdir=%{_initrddir} \ --localstatedir=%{_var} \ --with-version=%{version}-%{release} make %{_smp_mflags} V=1 docdir=%{pcmk_docdir} all %install rm -rf %{buildroot} make DESTDIR=%{buildroot} docdir=%{pcmk_docdir} V=1 install mkdir -p ${RPM_BUILD_ROOT}%{_sysconfdir}/sysconfig mkdir -p ${RPM_BUILD_ROOT}%{_var}/lib/pacemaker/cores install -m 644 mcp/pacemaker.sysconfig ${RPM_BUILD_ROOT}%{_sysconfdir}/sysconfig/pacemaker # Scripts that should be executable chmod a+x %{buildroot}/%{_datadir}/pacemaker/tests/cts/CTSlab.py # These are not actually scripts find %{buildroot} -name '*.xml' -type f -print0 | xargs -0 chmod a-x find %{buildroot} -name '*.xsl' -type f -print0 | xargs -0 chmod a-x find %{buildroot} -name '*.rng' -type f -print0 | xargs -0 chmod a-x find %{buildroot} -name '*.dtd' -type f -print0 | xargs -0 chmod a-x # Dont package static libs find %{buildroot} -name '*.a' -type f -print0 | xargs -0 rm -f find %{buildroot} -name '*.la' -type f -print0 | xargs -0 rm -f # Do not package these either rm -f %{buildroot}/%{_libdir}/service_crm.so %if %{with gcov} GCOV_BASE=%{buildroot}/%{_var}/lib/pacemaker/gcov mkdir -p $GCOV_BASE find . -name '*.gcno' -type f | while read F ; do D=`dirname $F` mkdir -p ${GCOV_BASE}/$D cp $F ${GCOV_BASE}/$D done %endif %clean rm -rf %{buildroot} %post %if %{defined _unitdir} /bin/systemctl daemon-reload >/dev/null 2>&1 || : %endif /sbin/chkconfig --add pacemaker || : %preun if [ $1 -eq 0 ]; then # Package removal, not upgrade /sbin/service pacemaker stop &>/dev/null || : /sbin/chkconfig --del pacemaker || : fi %pre -n %{name}-libs getent group %{gname} >/dev/null || groupadd -r %{gname} getent passwd %{uname} >/dev/null || useradd -r -g %{gname} -s /sbin/nologin -c "heartbeat user" %{uname} exit 0 %post -n %{name}-libs -p /sbin/ldconfig %postun -n %{name}-libs -p /sbin/ldconfig %post -n %{name}-cluster-libs -p /sbin/ldconfig %postun -n %{name}-cluster-libs -p /sbin/ldconfig %files ########################################################### %defattr(-,root,root) %exclude %{_datadir}/pacemaker/tests %config(noreplace) %{_sysconfdir}/sysconfig/pacemaker %if %{with corosync} %{_sbindir}/pacemakerd %{_initrddir}/pacemaker %endif %if %{defined _unitdir} %{_unitdir}/pacemaker.service %endif %{_datadir}/pacemaker %{_datadir}/snmp/mibs/PCMK-MIB.txt %exclude %{_libexecdir}/pacemaker/lrmd_test %{_libexecdir}/pacemaker/* %if %{with heartbeat} %{_libdir}/heartbeat/* %endif %{_sbindir}/crm_attribute %{_sbindir}/crm_master %{_sbindir}/crm_node %{_sbindir}/attrd_updater %{_sbindir}/fence_legacy %{_sbindir}/fence_pcmk %{_bindir}/ccs2cib %{_bindir}/ccs_flatten %{_bindir}/disable_rgmanager %{_sbindir}/stonith_admin %if %{with heartbeat} %{_sbindir}/crm_uuid %endif %doc %{_mandir}/man7/* %doc %{_mandir}/man8/attrd_updater.* %doc %{_mandir}/man8/crm_attribute.* %doc %{_mandir}/man8/crm_node.* %doc %{_mandir}/man8/crm_master.* %doc %{_mandir}/man8/fence_pcmk.* %if %{with corosync} %doc %{_mandir}/man8/pacemakerd.* %endif %doc %{_mandir}/man8/stonith_admin.* %doc COPYING %doc AUTHORS 
%doc ChangeLog %dir %attr (750, %{uname}, %{gname}) %{_var}/lib/pacemaker %dir %attr (750, %{uname}, %{gname}) %{_var}/lib/pacemaker/cib %dir %attr (750, %{uname}, %{gname}) %{_var}/lib/pacemaker/cores %dir %attr (750, %{uname}, %{gname}) %{_var}/lib/pacemaker/pengine %dir %attr (750, %{uname}, %{gname}) %{_var}/lib/pacemaker/blackbox %ghost %dir %attr (750, %{uname}, %{gname}) %{_var}/run/crm %dir /usr/lib/ocf %dir /usr/lib/ocf/resource.d /usr/lib/ocf/resource.d/pacemaker %if %{with corosync} %if 0%{?cs_major} < 2 %if 0%{?cs_minor} < 8 %{_libexecdir}/lcrso/pacemaker.lcrso %endif %endif %endif %files cli %defattr(-,root,root) %{_sbindir}/cibadmin %{_sbindir}/crm_diff %{_sbindir}/crm_error %{_sbindir}/crm_failcount %{_sbindir}/crm_mon %{_sbindir}/crm_resource %{_sbindir}/crm_standby %{_sbindir}/crm_verify %{_sbindir}/crmadmin %{_sbindir}/iso8601 %{_sbindir}/crm_shadow %{_sbindir}/crm_simulate %{_sbindir}/crm_report %{_sbindir}/crm_ticket %doc %{_mandir}/man8/* %exclude %{_mandir}/man8/attrd_updater.* %exclude %{_mandir}/man8/crm_attribute.* %exclude %{_mandir}/man8/crm_node.* %exclude %{_mandir}/man8/crm_master.* %exclude %{_mandir}/man8/fence_pcmk.* %if %{with corosync} %exclude %{_mandir}/man8/pacemakerd.* %endif %exclude %{_mandir}/man8/stonith_admin.* %doc COPYING %doc AUTHORS %doc ChangeLog %files -n %{name}-libs %defattr(-,root,root) %{_libdir}/libcib.so.* %{_libdir}/liblrmd.so.* %{_libdir}/libcrmservice.so.* %{_libdir}/libcrmcommon.so.* %{_libdir}/libpe_status.so.* %{_libdir}/libpe_rules.so.* %{_libdir}/libpengine.so.* %{_libdir}/libstonithd.so.* %{_libdir}/libtransitioner.so.* %doc COPYING.LIB %doc AUTHORS %files -n %{name}-cluster-libs %defattr(-,root,root) %{_libdir}/libcrmcluster.so.* %doc COPYING.LIB %doc AUTHORS %files doc %defattr(-,root,root) %doc %{pcmk_docdir} %files cts %defattr(-,root,root) %{py_sitedir}/cts %{_datadir}/pacemaker/tests/cts %{_libexecdir}/pacemaker/lrmd_test %doc COPYING.LIB %doc AUTHORS %files -n %{name}-libs-devel %defattr(-,root,root) %exclude %{_datadir}/pacemaker/tests/cts %{_datadir}/pacemaker/tests %{_includedir}/pacemaker %{_libdir}/*.so %if %{with gcov} %{_var}/lib/pacemaker %endif %{_libdir}/pkgconfig/*.pc %doc COPYING.LIB %doc AUTHORS %changelog * Wed Jun 20 2012 Andrew Beekhof Pacemaker-1.1.8-0.1 - 1.1.8 pre-releases * Wed Mar 28 2012 Andrew Beekhof Pacemaker-1.1.7-1 - Update source tarball to revision: bc7ff2c - Statistics: Changesets: 513 Diff: 1171 files changed, 90472 insertions(+), 19368 deletions(-) - See included ChangeLog file or https://github.com/ClusterLabs/pacemaker/blob/master/ChangeLog for details * Wed Aug 31 2011 Andrew Beekhof 1.1.6-1 - Update source tarball to revision: 676e5f25aa46 tip - Statistics: Changesets: 376 Diff: 1761 files changed, 36259 insertions(+), 140578 deletions(-) - See included ChangeLog file or https://github.com/ClusterLabs/pacemaker/blob/master/ChangeLog for details * Fri Feb 11 2011 Andrew Beekhof 1.1.5-1 - Update source tarball to revision: baad6636a053 - Statistics: Changesets: 184 Diff: 605 files changed, 46103 insertions(+), 26417 deletions(-) - See included ChangeLog file or https://github.com/ClusterLabs/pacemaker/blob/master/ChangeLog for details * Wed Oct 20 2010 Andrew Beekhof 1.1.4-1 - Moved all the interesting parts of the changelog into a separate file as per the Fedora policy :-/ - Update source tarball to revision: 75406c3eb2c1 tip - Significant performance enhancements to the Policy Engine and CIB - Statistics: Changesets: 169 Diff: 772 files changed, 56172 insertions(+), 39309 
deletions(-) - See included ChangeLog file or http://hg.clusterlabs.org/pacemaker/1.1/file/tip/ChangeLog for details * Tue Sep 21 2010 Andrew Beekhof 1.1.3-1 - Update source tarball to revision: e3bb31c56244 tip - Statistics: Changesets: 352 Diff: 481 files changed, 14130 insertions(+), 11156 deletions(-) * Wed May 12 2010 Andrew Beekhof 1.1.2-1 - Update source tarball to revision: c25c972a25cc tip - Statistics: Changesets: 339 Diff: 708 files changed, 37918 insertions(+), 10584 deletions(-) * Tue Feb 16 2010 Andrew Beekhof - 1.1.1-1 - First public release of Pacemaker 1.1 - Package reference documentation in a doc subpackage - Move cts into a subpackage so that it can be easily consumed by others - Update source tarball to revision: 17d9cd4ee29f + New stonith daemon that supports global notifications + Service placement influenced by the physical resources + A new tool for simulating failures and the cluster’s reaction to them + Ability to serialize an otherwise unrelated a set of resource actions (eg. Xen migrations) * Wed Feb 10 2010 Andrew Beekhof - 1.0.7-4 - Rebuild for heartbeat 3.0.2-2 * Wed Feb 10 2010 Andrew Beekhof - 1.0.7-3 - Rebuild for cluster-glue 1.0.3 * Tue Jan 19 2010 Andrew Beekhof - 1.0.7-2 - Rebuild for corosync 1.2.0 * Mon Jan 18 2010 Andrew Beekhof - 1.0.7-1 - Update source tarball to revision: 2eed906f43e9 (stable-1.0) tip - Statistics: Changesets: 193 Diff: 220 files changed, 15933 insertions(+), 8782 deletions(-) * Thu Oct 29 2009 Andrew Beekhof - 1.0.5-4 - Include the fixes from CoroSync integration testing - Move the resource templates - they are not documentation - Ensure documentation is placed in a standard location - Exclude documentation that is included elsewhere in the package - Update the tarball from upstream to version ee19d8e83c2a + High: cib: Correctly clean up when both plaintext and tls remote ports are requested + High: PE: Bug bnc#515172 - Provide better defaults for lt(e) and gt(e) comparisions + High: PE: Bug lf#2197 - Allow master instances placemaker to be influenced by colocation constraints + High: PE: Make sure promote/demote pseudo actions are created correctly + High: PE: Prevent target-role from promoting more than master-max instances + High: ais: Bug lf#2199 - Prevent expected-quorum-votes from being populated with garbage + High: ais: Prevent deadlock - dont try to release IPC message if the connection failed + High: cib: For validation errors, send back the full CIB so the client can display the errors + High: cib: Prevent use-after-free for remote plaintext connections + High: crmd: Bug lf#2201 - Prevent use-of-NULL when running heartbeat * Wed Oct 13 2009 Andrew Beekhof - 1.0.5-3 - Update the tarball from upstream to version 38cd629e5c3c + High: Core: Bug lf#2169 - Allow dtd/schema validation to be disabled + High: PE: Bug lf#2106 - Not all anonymous clone children are restarted after configuration change + High: PE: Bug lf#2170 - stop-all-resources option had no effect + High: PE: Bug lf#2171 - Prevent groups from starting if they depend on a complex resource which can not + High: PE: Disable resource management if stonith-enabled=true and no stonith resources are defined + High: PE: do not include master score if it would prevent allocation + High: ais: Avoid excessive load by checking for dead children every 1s (instead of 100ms) + High: ais: Bug rh#525589 - Prevent shutdown deadlocks when running on CoroSync + High: ais: Gracefully handle changes to the AIS nodeid + High: crmd: Bug bnc#527530 - Wait for the transition to 
complete before leaving S_TRANSITION_ENGINE + High: crmd: Prevent use-after-free with LOG_DEBUG_3 + Medium: xml: Mask the "symmetrical" attribute on rsc_colocation constraints (bnc#540672) + Medium (bnc#520707): Tools: crm: new templates ocfs2 and clvm + Medium: Build: Invert the disable ais/heartbeat logic so that --without (ais|heartbeat) is available to rpmbuild + Medium: PE: Bug lf#2178 - Indicate unmanaged clones + Medium: PE: Bug lf#2180 - Include node information for all failed ops + Medium: PE: Bug lf#2189 - Incorrect error message when unpacking simple ordering constraint + Medium: PE: Correctly log resources that would like to start but can not + Medium: PE: Stop ptest from logging to syslog + Medium: ais: Include version details in plugin name + Medium: crmd: Requery the resource metadata after every start operation * Fri Aug 21 2009 Tomas Mraz - 1.0.5-2.1 - rebuilt with new openssl * Wed Aug 19 2009 Andrew Beekhof - 1.0.5-2 - Add versioned perl dependency as specified by https://fedoraproject.org/wiki/Packaging/Perl#Packages_that_link_to_libperl - No longer remove RPATH data, it prevents us finding libperl.so and no other libraries were being hardcoded - Compile in support for heartbeat - Conditionally add heartbeat-devel and corosynclib-devel to the -devel requirements depending on which stacks are supported * Mon Aug 17 2009 Andrew Beekhof - 1.0.5-1 - Add dependency on resource-agents - Use the version of the configure macro that supplies --prefix, --libdir, etc - Update the tarball from upstream to version 462f1569a437 (Pacemaker 1.0.5 final) + High: Tools: crm_resource - Advertise --move instead of --migrate + Medium: Extra: New node connectivity RA that uses system ping and attrd_updater + Medium: crmd: Note that dc-deadtime can be used to mask the brokeness of some switches * Tue Aug 11 2009 Ville Skyttä - 1.0.5-0.7.c9120a53a6ae.hg - Use bzipped upstream tarball. 
* Wed Jul 29 2009 Andrew Beekhof - 1.0.5-0.6.c9120a53a6ae.hg
- Add back missing build auto* dependencies
- Minor cleanups to the install directive

* Tue Jul 28 2009 Andrew Beekhof - 1.0.5-0.5.c9120a53a6ae.hg
- Add a leading zero to the revision when alphatag is used

* Tue Jul 28 2009 Andrew Beekhof - 1.0.5-0.4.c9120a53a6ae.hg
- Incorporate the feedback from the cluster-glue review
- Realistically, the version is a 1.0.5 pre-release
- Use the global directive instead of define for variables
- Use the haclient/hacluster group/user instead of daemon
- Use the _configure macro
- Fix install dependencies

* Fri Jul 24 2009 Andrew Beekhof - 1.0.4-3
- Initial Fedora checkin
- Include an AUTHORS and license file in each package
- Change the library package name to pacemaker-libs to be more Fedora compliant
- Remove execute permissions from xml related files
- Reference the new cluster-glue devel package name
- Update the tarball from upstream to version c9120a53a6ae
  + High: PE: Only prevent migration if the clone dependency is stopping/starting on the target node
  + High: PE: Bug 2160 - Don't shuffle clones due to colocation
  + High: PE: New implementation of the resource migration (not stop/start) logic
  + Medium: Tools: crm_resource - Prevent use-of-NULL by requiring a resource name for the -A and -a options
  + Medium: PE: Prevent use-of-NULL in find_first_action()

* Tue Jul 14 2009 Andrew Beekhof - 1.0.4-2
- Reference authors from the project AUTHORS file instead of listing in description
- Change Source0 to reference the Mercurial repo
- Cleaned up the summaries and descriptions
- Incorporate the results of Fedora package self-review

* Thu Jun 04 2009 Andrew Beekhof - 1.0.4-1
- Update source tarball to revision: 1d87d3e0fc7f (stable-1.0)
- Statistics: Changesets: 209 Diff: 266 files changed, 12010 insertions(+), 8276 deletions(-)

* Wed Apr 08 2009 Andrew Beekhof - 1.0.3-1
- Update source tarball to revision: b133b3f19797 (stable-1.0) tip
- Statistics: Changesets: 383 Diff: 329 files changed, 15471 insertions(+), 15119 deletions(-)

* Mon Feb 16 2009 Andrew Beekhof - 1.0.2-1
- Update source tarball to revision: d232d19daeb9 (stable-1.0) tip
- Statistics: Changesets: 441 Diff: 639 files changed, 20871 insertions(+), 21594 deletions(-)

* Tue Nov 18 2008 Andrew Beekhof - 1.0.1-1
- Update source tarball to revision: 6fc5ce8302ab (stable-1.0) tip
- Statistics: Changesets: 170 Diff: 816 files changed, 7633 insertions(+), 6286 deletions(-)

* Thu Oct 16 2008 Andrew Beekhof - 1.0.0-1
- Update source tarball to revision: 388654dfef8f tip
- Statistics: Changesets: 261 Diff: 3021 files changed, 244985 insertions(+), 111596 deletions(-)

* Mon Sep 22 2008 Andrew Beekhof - 0.7.3-1
- Update source tarball to revision: 33e677ab7764+ tip
- Statistics: Changesets: 133 Diff: 89 files changed, 7492 insertions(+), 1125 deletions(-)

* Wed Aug 20 2008 Andrew Beekhof - 0.7.1-1
- Update source tarball to revision: f805e1b30103+ tip
- Statistics: Changesets: 184 Diff: 513 files changed, 43408 insertions(+), 43783 deletions(-)

* Fri Jul 18 2008 Andrew Beekhof - 0.7.0-19
- Update source tarball to revision: 007c3a1c50f5 (unstable) tip
- Statistics: Changesets: 108 Diff: 216 files changed, 4632 insertions(+), 4173 deletions(-)

* Wed Jun 25 2008 Andrew Beekhof - 0.7.0-1
- Update source tarball to revision: bde0c7db74fb tip
- Statistics: Changesets: 439 Diff: 676 files changed, 41310 insertions(+), 52071 deletions(-)

* Thu Jun 19 2008 Andrew Beekhof - 0.6.5-1
- Update source tarball to revision: b9fe723d1ac5 tip
- Statistics:
  Changesets: 48 Diff: 37 files changed, 1204 insertions(+), 234 deletions(-)

* Thu May 22 2008 Andrew Beekhof - 0.6.4-1
- Update source tarball to revision: 226d8e356924 tip
- Statistics: Changesets: 55 Diff: 199 files changed, 7103 insertions(+), 12378 deletions(-)

* Wed Apr 23 2008 Andrew Beekhof - 0.6.3-1
- Update source tarball to revision: fd8904c9bc67 tip
- Statistics: Changesets: 117 Diff: 354 files changed, 19094 insertions(+), 11338 deletions(-)

* Thu Feb 14 2008 Andrew Beekhof - 0.6.2-1
- Update source tarball to revision: 28b1a8c1868b tip
- Statistics: Changesets: 11 Diff: 7 files changed, 58 insertions(+), 18 deletions(-)

* Tue Feb 12 2008 Andrew Beekhof - 0.6.1-1
- Update source tarball to revision: e7152d1be933 tip
- Statistics: Changesets: 25 Diff: 37 files changed, 1323 insertions(+), 227 deletions(-)

* Mon Jan 14 2008 Andrew Beekhof - 0.6.0-2
- This is the first release of the Pacemaker Cluster Resource Manager, formerly part of Heartbeat.
- For those looking for the GUI, mgmtd, CIM or TSA components, they are now found in the new pacemaker-pygui project. Build dependencies prevent them from being included in Heartbeat (since the built-in CRM is no longer supported) and, being non-core components, they are not included with Pacemaker.
- Update source tarball to revision: c94b92d550cf
- Statistics: Changesets: 347 Diff: 2272 files changed, 132508 insertions(+), 305991 deletions(-)
- Test hardware:
  + 6-node vmware cluster (sles10-sp1/256Mb/vmware stonith) on a single host (opensuse10.3/2Gb/2.66Ghz Quad Core2)
  + 7-node EMC Centera cluster (sles10/512Mb/2Ghz Xeon/ssh stonith)
- Notes: Heartbeat Stack
  + All testing was performed with STONITH enabled
  + The CRM was enabled using the "crm respawn" directive
- Notes: OpenAIS Stack
  + This release contains a preview of support for the OpenAIS cluster stack
  + The current release of the OpenAIS project is missing two important patches that we require. OpenAIS packages containing these patches are available for most major distributions at: http://download.opensuse.org/repositories/server:/ha-clustering
  + The OpenAIS stack is not currently recommended for use in clusters that have shared data, as STONITH support is not yet implemented
  + pingd is not yet available for use with the OpenAIS stack
  + 3 significant OpenAIS issues were found during testing of 4- and 6-node clusters. We are actively working together with the OpenAIS project to get these resolved.
- Pending bugs encountered during testing:
  + OpenAIS #1736 - Openais membership took 20s to stabilize
  + Heartbeat #1750 - ipc_bufpool_update: magic number in head does not match
  + OpenAIS #1793 - Assertion failure in memb_state_gather_enter()
  + OpenAIS #1796 - Cluster message corruption

* Mon Dec 10 2007 Andrew Beekhof - 0.6.0-1
- Initial opensuse package check-in

diff --git a/pengine/group.c b/pengine/group.c
index 2733ef797c..8fd1152c22 100644
--- a/pengine/group.c
+++ b/pengine/group.c
@@ -1,511 +1,526 @@
/* * Copyright (C) 2004 Andrew Beekhof * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public * License as published by the Free Software Foundation; either * version 2 of the License, or (at your option) any later version. * * This software is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * General Public License for more details.
* * You should have received a copy of the GNU General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ #include #include #include #include #include #define VARIANT_GROUP 1 #include node_t * group_color(resource_t * rsc, node_t * prefer, pe_working_set_t * data_set) { node_t *node = NULL; node_t *group_node = NULL; GListPtr gIter = NULL; group_variant_data_t *group_data = NULL; get_group_variant_data(group_data, rsc); if (is_not_set(rsc->flags, pe_rsc_provisional)) { return rsc->allocated_to; } pe_rsc_trace(rsc, "Processing %s", rsc->id); if (is_set(rsc->flags, pe_rsc_allocating)) { pe_rsc_debug(rsc, "Dependency loop detected involving %s", rsc->id); return NULL; } if (group_data->first_child == NULL) { /* nothing to allocate */ clear_bit(rsc->flags, pe_rsc_provisional); return NULL; } set_bit(rsc->flags, pe_rsc_allocating); rsc->role = group_data->first_child->role; group_data->first_child->rsc_cons = g_list_concat(group_data->first_child->rsc_cons, rsc->rsc_cons); rsc->rsc_cons = NULL; group_data->first_child->rsc_cons_lhs = g_list_concat(group_data->first_child->rsc_cons_lhs, rsc->rsc_cons_lhs); rsc->rsc_cons_lhs = NULL; gIter = rsc->rsc_tickets; for (; gIter != NULL; gIter = gIter->next) { rsc_ticket_t *rsc_ticket = (rsc_ticket_t *) gIter->data; if (rsc_ticket->ticket->granted == FALSE || rsc_ticket->ticket->standby) { rsc_ticket_constraint(rsc, rsc_ticket, data_set); } } dump_node_scores(show_scores ? 0 : scores_log_level, rsc, __PRETTY_FUNCTION__, rsc->allowed_nodes); gIter = rsc->children; for (; gIter != NULL; gIter = gIter->next) { resource_t *child_rsc = (resource_t *) gIter->data; node = child_rsc->cmds->allocate(child_rsc, prefer, data_set); if (group_node == NULL) { group_node = node; } } rsc->next_role = group_data->first_child->next_role; clear_bit(rsc->flags, pe_rsc_allocating); clear_bit(rsc->flags, pe_rsc_provisional); if (group_data->colocated) { return group_node; } return NULL; } void group_update_pseudo_status(resource_t * parent, resource_t * child); void group_create_actions(resource_t * rsc, pe_working_set_t * data_set) { action_t *op = NULL; const char *value = NULL; GListPtr gIter = rsc->children; pe_rsc_trace(rsc, "Creating actions for %s", rsc->id); for (; gIter != NULL; gIter = gIter->next) { resource_t *child_rsc = (resource_t *) gIter->data; child_rsc->cmds->create_actions(child_rsc, data_set); group_update_pseudo_status(rsc, child_rsc); } op = start_action(rsc, NULL, TRUE /* !group_data->child_starting */ ); set_bit(op->flags, pe_action_pseudo | pe_action_runnable); op = custom_action(rsc, started_key(rsc), RSC_STARTED, NULL, TRUE /* !group_data->child_starting */ , TRUE, data_set); set_bit(op->flags, pe_action_pseudo | pe_action_runnable); op = stop_action(rsc, NULL, TRUE /* !group_data->child_stopping */ ); set_bit(op->flags, pe_action_pseudo | pe_action_runnable); op = custom_action(rsc, stopped_key(rsc), RSC_STOPPED, NULL, TRUE /* !group_data->child_stopping */ , TRUE, data_set); set_bit(op->flags, pe_action_pseudo | pe_action_runnable); value = g_hash_table_lookup(rsc->meta, "stateful"); if (crm_is_true(value)) { op = custom_action(rsc, demote_key(rsc), RSC_DEMOTE, NULL, TRUE, TRUE, data_set); set_bit(op->flags, pe_action_pseudo); set_bit(op->flags, pe_action_runnable); op = custom_action(rsc, demoted_key(rsc), RSC_DEMOTED, NULL, TRUE, TRUE, data_set); set_bit(op->flags, pe_action_pseudo); set_bit(op->flags, pe_action_runnable); op =
custom_action(rsc, promote_key(rsc), RSC_PROMOTE, NULL, TRUE, TRUE, data_set); set_bit(op->flags, pe_action_pseudo); set_bit(op->flags, pe_action_runnable); op = custom_action(rsc, promoted_key(rsc), RSC_PROMOTED, NULL, TRUE, TRUE, data_set); set_bit(op->flags, pe_action_pseudo); set_bit(op->flags, pe_action_runnable); } } void group_update_pseudo_status(resource_t * parent, resource_t * child) { GListPtr gIter = child->actions; group_variant_data_t *group_data = NULL; get_group_variant_data(group_data, parent); if (group_data->ordered == FALSE) { /* If this group is not ordered, then leave the meta-actions as optional */ return; } if (group_data->child_stopping && group_data->child_starting) { return; } for (; gIter != NULL; gIter = gIter->next) { action_t *action = (action_t *) gIter->data; if (is_set(action->flags, pe_action_optional)) { continue; } if (safe_str_eq(RSC_STOP, action->task) && is_set(action->flags, pe_action_runnable)) { group_data->child_stopping = TRUE; pe_rsc_trace(action->rsc, "Based on %s the group is stopping", action->uuid); } else if (safe_str_eq(RSC_START, action->task) && is_set(action->flags, pe_action_runnable)) { group_data->child_starting = TRUE; pe_rsc_trace(action->rsc, "Based on %s the group is starting", action->uuid); } } } void group_internal_constraints(resource_t * rsc, pe_working_set_t * data_set) { GListPtr gIter = rsc->children; resource_t *last_rsc = NULL; + resource_t *last_active = NULL; resource_t *top = uber_parent(rsc); group_variant_data_t *group_data = NULL; get_group_variant_data(group_data, rsc); new_rsc_order(rsc, RSC_STOPPED, rsc, RSC_START, pe_order_optional, data_set); new_rsc_order(rsc, RSC_START, rsc, RSC_STARTED, pe_order_runnable_left, data_set); new_rsc_order(rsc, RSC_STOP, rsc, RSC_STOPPED, pe_order_runnable_left, data_set); for (; gIter != NULL; gIter = gIter->next) { resource_t *child_rsc = (resource_t *) gIter->data; int stop = pe_order_none; int stopped = pe_order_implies_then_printed; int start = pe_order_implies_then | pe_order_runnable_left; int started = pe_order_runnable_left | pe_order_implies_then | pe_order_implies_then_printed; child_rsc->cmds->internal_constraints(child_rsc, data_set); if (last_rsc == NULL) { if (group_data->ordered) { stop |= pe_order_optional; stopped = pe_order_implies_then; } } else if (group_data->colocated) { rsc_colocation_new("group:internal_colocation", NULL, INFINITY, child_rsc, last_rsc, NULL, NULL, data_set); } if (top->variant == pe_master) { new_rsc_order(rsc, RSC_DEMOTE, child_rsc, RSC_DEMOTE, stop | pe_order_implies_first_printed, data_set); new_rsc_order(child_rsc, RSC_DEMOTE, rsc, RSC_DEMOTED, stopped, data_set); new_rsc_order(child_rsc, RSC_PROMOTE, rsc, RSC_PROMOTED, started, data_set); new_rsc_order(rsc, RSC_PROMOTE, child_rsc, RSC_PROMOTE, pe_order_implies_first_printed, data_set); } order_start_start(rsc, child_rsc, pe_order_implies_first_printed); order_stop_stop(rsc, child_rsc, stop | pe_order_implies_first_printed); new_rsc_order(child_rsc, RSC_STOP, rsc, RSC_STOPPED, stopped, data_set); new_rsc_order(child_rsc, RSC_START, rsc, RSC_STARTED, started, data_set); if (group_data->ordered == FALSE) { order_start_start(rsc, child_rsc, start | pe_order_implies_first_printed); if (top->variant == pe_master) { new_rsc_order(rsc, RSC_PROMOTE, child_rsc, RSC_PROMOTE, start | pe_order_implies_first_printed, data_set); } } else if (last_rsc != NULL) { child_rsc->restart_type = pe_restart_restart; order_start_start(last_rsc, child_rsc, start); order_stop_stop(child_rsc, last_rsc, 
pe_order_optional|pe_order_restart); if (top->variant == pe_master) { new_rsc_order(last_rsc, RSC_PROMOTE, child_rsc, RSC_PROMOTE, start, data_set); new_rsc_order(child_rsc, RSC_DEMOTE, last_rsc, RSC_DEMOTE, pe_order_optional, data_set); } } else { /* If anyone in the group is starting, then * pe_order_implies_then will cause _everyone_ in the group * to be sent a start action * But this is safe since starting something that is already * started is required to be "safe" */ int flags = pe_order_none; order_start_start(rsc, child_rsc, flags); if (top->variant == pe_master) { new_rsc_order(rsc, RSC_PROMOTE, child_rsc, RSC_PROMOTE, flags, data_set); } } + /* Look for partially active groups + * Make sure they still shut down in sequence + */ + if(child_rsc->running_on) { + if(group_data->ordered + && last_rsc + && last_rsc->running_on == NULL + && last_active + && last_active->running_on) { + order_stop_stop(child_rsc, last_active, pe_order_optional); + } + last_active = child_rsc; + } + last_rsc = child_rsc; } if (group_data->ordered && last_rsc != NULL) { int stop_stop_flags = pe_order_implies_then; int stop_stopped_flags = pe_order_optional; order_stop_stop(rsc, last_rsc, stop_stop_flags); new_rsc_order(last_rsc, RSC_STOP, rsc, RSC_STOPPED, stop_stopped_flags, data_set); if (top->variant == pe_master) { new_rsc_order(rsc, RSC_DEMOTE, last_rsc, RSC_DEMOTE, stop_stop_flags, data_set); new_rsc_order(last_rsc, RSC_DEMOTE, rsc, RSC_DEMOTED, stop_stopped_flags, data_set); } } } void group_rsc_colocation_lh(resource_t * rsc_lh, resource_t * rsc_rh, rsc_colocation_t * constraint) { GListPtr gIter = NULL; group_variant_data_t *group_data = NULL; if (rsc_lh == NULL) { pe_err("rsc_lh was NULL for %s", constraint->id); return; } else if (rsc_rh == NULL) { pe_err("rsc_rh was NULL for %s", constraint->id); return; } gIter = rsc_lh->children; pe_rsc_trace(rsc_lh, "Processing constraints from %s", rsc_lh->id); get_group_variant_data(group_data, rsc_lh); if (group_data->colocated) { group_data->first_child->cmds->rsc_colocation_lh(group_data->first_child, rsc_rh, constraint); return; } else if (constraint->score >= INFINITY) { crm_config_err("%s: Cannot perform mandatory colocation" " between non-colocated group and %s", rsc_lh->id, rsc_rh->id); return; } for (; gIter != NULL; gIter = gIter->next) { resource_t *child_rsc = (resource_t *) gIter->data; child_rsc->cmds->rsc_colocation_lh(child_rsc, rsc_rh, constraint); } } void group_rsc_colocation_rh(resource_t * rsc_lh, resource_t * rsc_rh, rsc_colocation_t * constraint) { GListPtr gIter = rsc_rh->children; group_variant_data_t *group_data = NULL; get_group_variant_data(group_data, rsc_rh); CRM_CHECK(rsc_lh->variant == pe_native, return); pe_rsc_trace(rsc_rh, "Processing RH of constraint %s", constraint->id); print_resource(LOG_DEBUG_3, "LHS", rsc_lh, TRUE); if (is_set(rsc_rh->flags, pe_rsc_provisional)) { return; } else if (group_data->colocated && group_data->first_child) { if (constraint->score >= INFINITY) { /* Ensure RHS is _fully_ up before we can start LHS */ group_data->last_child->cmds->rsc_colocation_rh(rsc_lh, group_data->last_child, constraint); } else { /* A partially active RHS is fine */ group_data->first_child->cmds->rsc_colocation_rh(rsc_lh, group_data->first_child, constraint); } return; } else if (constraint->score >= INFINITY) { crm_config_err("%s: Cannot perform mandatory colocation with" " non-colocated group: %s", rsc_lh->id, rsc_rh->id); return; } for (; gIter != NULL; gIter = gIter->next) { resource_t *child_rsc = (resource_t *)
gIter->data; child_rsc->cmds->rsc_colocation_rh(rsc_lh, child_rsc, constraint); } } enum pe_action_flags group_action_flags(action_t * action, node_t * node) { GListPtr gIter = NULL; enum pe_action_flags flags = (pe_action_optional | pe_action_runnable | pe_action_pseudo); for (gIter = action->rsc->children; gIter != NULL; gIter = gIter->next) { resource_t *child = (resource_t *) gIter->data; enum action_tasks task = get_complex_task(child, action->task, TRUE); const char *task_s = task2text(task); action_t *child_action = find_first_action(child->actions, NULL, task_s, node); if (child_action) { enum pe_action_flags child_flags = child->cmds->action_flags(child_action, node); if (is_set(flags, pe_action_optional) && is_set(child_flags, pe_action_optional) == FALSE) { pe_rsc_trace(action->rsc, "%s is mandatory because of %s", action->uuid, child_action->uuid); clear_bit(flags, pe_action_optional); pe_clear_action_bit(action, pe_action_optional); } if (safe_str_neq(task_s, action->task) && is_set(flags, pe_action_runnable) && is_set(child_flags, pe_action_runnable) == FALSE) { pe_rsc_trace(action->rsc, "%s is not runnable because of %s", action->uuid, child_action->uuid); clear_bit(flags, pe_action_runnable); pe_clear_action_bit(action, pe_action_runnable); } } else if (task != stop_rsc) { pe_rsc_trace(action->rsc, "%s is not runnable because of %s (not found in %s)", action->uuid, task_s, child->id); clear_bit(flags, pe_action_runnable); } } return flags; } enum pe_graph_flags group_update_actions(action_t * first, action_t * then, node_t * node, enum pe_action_flags flags, enum pe_action_flags filter, enum pe_ordering type) { GListPtr gIter = then->rsc->children; enum pe_graph_flags changed = pe_graph_none; CRM_ASSERT(then->rsc != NULL); changed |= native_update_actions(first, then, node, flags, filter, type); for (; gIter != NULL; gIter = gIter->next) { resource_t *child = (resource_t *) gIter->data; action_t *child_action = find_first_action(child->actions, NULL, then->task, node); if (child_action) { changed |= child->cmds->update_actions(first, child_action, node, flags, filter, type); } } return changed; } void group_rsc_location(resource_t * rsc, rsc_to_node_t * constraint) { GListPtr gIter = rsc->children; GListPtr saved = constraint->node_list_rh; GListPtr zero = node_list_dup(constraint->node_list_rh, TRUE, FALSE); gboolean reset_scores = TRUE; group_variant_data_t *group_data = NULL; get_group_variant_data(group_data, rsc); pe_rsc_debug(rsc, "Processing rsc_location %s for %s", constraint->id, rsc->id); native_rsc_location(rsc, constraint); for (; gIter != NULL; gIter = gIter->next) { resource_t *child_rsc = (resource_t *) gIter->data; child_rsc->cmds->rsc_location(child_rsc, constraint); if (group_data->colocated && reset_scores) { reset_scores = FALSE; constraint->node_list_rh = zero; } } constraint->node_list_rh = saved; g_list_free_full(zero, free); } void group_expand(resource_t * rsc, pe_working_set_t * data_set) { GListPtr gIter = rsc->children; pe_rsc_trace(rsc, "Processing actions from %s", rsc->id); CRM_CHECK(rsc != NULL, return); native_expand(rsc, data_set); for (; gIter != NULL; gIter = gIter->next) { resource_t *child_rsc = (resource_t *) gIter->data; child_rsc->cmds->expand(child_rsc, data_set); } } GHashTable * group_merge_weights(resource_t * rsc, const char *rhs, GHashTable * nodes, const char *attr, float factor, enum pe_weights flags) { GListPtr gIter = rsc->rsc_cons_lhs; group_variant_data_t *group_data = NULL; get_group_variant_data(group_data, rsc); if
(is_set(rsc->flags, pe_rsc_merging)) { pe_rsc_info(rsc, "Breaking dependency loop with %s at %s", rsc->id, rhs); return nodes; } set_bit(rsc->flags, pe_rsc_merging); nodes = group_data->first_child->cmds->merge_weights(group_data->first_child, rhs, nodes, attr, factor, flags); for (; gIter != NULL; gIter = gIter->next) { rsc_colocation_t *constraint = (rsc_colocation_t *) gIter->data; nodes = native_merge_weights(constraint->rsc_lh, rsc->id, nodes, constraint->node_attribute, (float) constraint->score / INFINITY, flags); } clear_bit(rsc->flags, pe_rsc_merging); return nodes; } void group_append_meta(resource_t * rsc, xmlNode * xml) { } diff --git a/pengine/regression.sh b/pengine/regression.sh index 637042a36b..f4a47783a7 100755 --- a/pengine/regression.sh +++ b/pengine/regression.sh @@ -1,657 +1,658 @@ #!/bin/bash # Copyright (C) 2004 Andrew Beekhof # # This program is free software; you can redistribute it and/or # modify it under the terms of the GNU General Public # License as published by the Free Software Foundation; either # version 2 of the License, or (at your option) any later version. # # This software is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU # General Public License for more details. # # You should have received a copy of the GNU General Public # License along with this library; if not, write to the Free Software # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA # core=`dirname $0` . $core/regression.core.sh create_mode="true" info Generating test outputs for these tests... # do_test file description info Done. echo "" info Performing the following tests from $io_dir create_mode="false" echo "" do_test simple1 "Offline " do_test simple2 "Start " do_test simple3 "Start 2 " do_test simple4 "Start Failed" do_test simple6 "Stop Start " do_test simple7 "Shutdown " #do_test simple8 "Stonith " #do_test simple9 "Lower version" #do_test simple10 "Higher version" do_test simple11 "Priority (ne)" do_test simple12 "Priority (eq)" do_test simple8 "Stickiness" echo "" do_test group1 "Group " do_test group2 "Group + Native " do_test group3 "Group + Group " do_test group4 "Group + Native (nothing)" do_test group5 "Group + Native (move) " do_test group6 "Group + Group (move) " do_test group7 "Group colocation" do_test group13 "Group colocation (cant run)" do_test group8 "Group anti-colocation" do_test group9 "Group recovery" do_test group10 "Group partial recovery" do_test group11 "Group target_role" do_test group14 "Group stop (graph terminated)" do_test group15 "-ve group colocation" do_test bug-1573 "Partial stop of a group with two children" do_test bug-1718 "Mandatory group ordering - Stop group_FUN" do_test bug-lf-2613 "Move group on failure" do_test bug-lf-2619 "Move group on clone failure" +do_test group-fail "Ensure stop order is preserved for partially active groups" echo "" do_test rsc_dep1 "Must not " do_test rsc_dep3 "Must " do_test rsc_dep5 "Must not 3 " do_test rsc_dep7 "Must 3 " do_test rsc_dep10 "Must (but cant)" do_test rsc_dep2 "Must (running) " do_test rsc_dep8 "Must (running : alt) " do_test rsc_dep4 "Must (running + move)" do_test asymmetric "Asymmetric - require explicit location constraints" echo "" do_test orphan-0 "Orphan ignore" do_test orphan-1 "Orphan stop" do_test orphan-2 "Orphan stop, remove failcount" echo "" do_test params-0 "Params: No change" do_test params-1 "Params: Changed" do_test params-2 
"Params: Resource definition" do_test params-4 "Params: Reload" do_test params-5 "Params: Restart based on probe digest" do_test novell-251689 "Resource definition change + target_role=stopped" do_test bug-lf-2106 "Restart all anonymous clone instances after config change" do_test params-6 "Params: Detect reload in previously migrated resource" echo "" do_test target-0 "Target Role : baseline" do_test target-1 "Target Role : master" do_test target-2 "Target Role : invalid" echo "" do_test domain "Failover domains" do_test base-score "Set a node's default score for all nodes" echo "" do_test date-1 "Dates" -t "2005-020" do_test date-2 "Date Spec - Pass" -t "2005-020T12:30" do_test date-3 "Date Spec - Fail" -t "2005-020T11:30" do_test probe-0 "Probe (anon clone)" do_test probe-1 "Pending Probe" do_test probe-2 "Correctly re-probe cloned groups" do_test probe-3 "Probe (pending node)" do_test probe-4 "Probe (pending node + stopped resource)" --rc 4 do_test standby "Standby" do_test comments "Comments" echo "" do_test one-or-more-0 "Everything starts" do_test one-or-more-1 "Nothing starts because of A" do_test one-or-more-2 "D can start because of C" do_test one-or-more-3 "D cannot start because of B and C" do_test one-or-more-4 "D cannot start because of target-role" do_test one-or-more-5 "Start A and F even though C and D are stopped" do_test one-or-more-6 "Leave A running even though B is stopped" do_test one-or-more-7 "Leave A running even though C is stopped" echo "" do_test order1 "Order start 1 " do_test order2 "Order start 2 " do_test order3 "Order stop " do_test order4 "Order (multiple) " do_test order5 "Order (move) " do_test order6 "Order (move w/ restart) " do_test order7 "Order (manditory) " do_test order-optional "Order (score=0) " do_test order-required "Order (score=INFINITY) " do_test bug-lf-2171 "Prevent group start when clone is stopped" do_test order-clone "Clone ordering should be able to prevent startup of dependant clones" do_test order-sets "Ordering for resource sets" do_test order-serialize "Serialize resources without inhibiting migration" do_test order-serialize-set "Serialize a set of resources without inhibiting migration" do_test clone-order-primitive "Order clone start after a primitive" do_test order-optional-keyword "Order (optional keyword)" do_test order-mandatory "Order (mandatory keyword)" do_test bug-lf-2493 "Don't imply colocation requirements when applying ordering constraints with clones" do_test ordered-set-basic-startup "Constraint set with default order settings." do_test order-wrong-kind "Order (error)" echo "" do_test coloc-loop "Colocation - loop" do_test coloc-many-one "Colocation - many-to-one" do_test coloc-list "Colocation - many-to-one with list" do_test coloc-group "Colocation - groups" do_test coloc-slave-anti "Anti-colocation with slave shouldn't prevent master colocation" do_test coloc-attr "Colocation based on node attributes" do_test coloc-negative-group "Negative colocation with a group" do_test coloc-intra-set "Intra-set colocation" do_test bug-lf-2435 "Colocation sets with a negative score" do_test coloc-clone-stays-active "Ensure clones don't get stopped/demoted because a dependant must stop" do_test coloc_fp_logic "Verify floating point calculations in colocation are working" do_test colo_master_w_native "cl#5070 - Verify promotion order is affected when colocating master to native rsc." do_test colo_slave_w_native "cl#5070 - Verify promotion order is affected when colocating slave to native rsc." 
echo "" do_test rsc-sets-seq-true "Resource Sets - sequential=false" do_test rsc-sets-seq-false "Resource Sets - sequential=true" do_test rsc-sets-clone "Resource Sets - Clone" do_test rsc-sets-master "Resource Sets - Master" do_test rsc-sets-clone-1 "Resource Sets - Clone (lf#2404)" #echo "" #do_test agent1 "version: lt (empty)" #do_test agent2 "version: eq " #do_test agent3 "version: gt " echo "" do_test attrs1 "string: eq (and) " do_test attrs2 "string: lt / gt (and)" do_test attrs3 "string: ne (or) " do_test attrs4 "string: exists " do_test attrs5 "string: not_exists " do_test attrs6 "is_dc: true " do_test attrs7 "is_dc: false " do_test attrs8 "score_attribute " echo "" do_test mon-rsc-1 "Schedule Monitor - start" do_test mon-rsc-2 "Schedule Monitor - move " do_test mon-rsc-3 "Schedule Monitor - pending start " do_test mon-rsc-4 "Schedule Monitor - move/pending start" echo "" do_test rec-rsc-0 "Resource Recover - no start " do_test rec-rsc-1 "Resource Recover - start " do_test rec-rsc-2 "Resource Recover - monitor " do_test rec-rsc-3 "Resource Recover - stop - ignore" do_test rec-rsc-4 "Resource Recover - stop - block " do_test rec-rsc-5 "Resource Recover - stop - fence " do_test rec-rsc-6 "Resource Recover - multiple - restart" do_test rec-rsc-7 "Resource Recover - multiple - stop " do_test rec-rsc-8 "Resource Recover - multiple - block " do_test rec-rsc-9 "Resource Recover - group/group" echo "" do_test quorum-1 "No quorum - ignore" do_test quorum-2 "No quorum - freeze" do_test quorum-3 "No quorum - stop " do_test quorum-4 "No quorum - start anyway" do_test quorum-5 "No quorum - start anyway (group)" do_test quorum-6 "No quorum - start anyway (clone)" echo "" do_test rec-node-1 "Node Recover - Startup - no fence" do_test rec-node-2 "Node Recover - Startup - fence " do_test rec-node-3 "Node Recover - HA down - no fence" do_test rec-node-4 "Node Recover - HA down - fence " do_test rec-node-5 "Node Recover - CRM down - no fence" do_test rec-node-6 "Node Recover - CRM down - fence " do_test rec-node-7 "Node Recover - no quorum - ignore " do_test rec-node-8 "Node Recover - no quorum - freeze " do_test rec-node-9 "Node Recover - no quorum - stop " do_test rec-node-10 "Node Recover - no quorum - stop w/fence" do_test rec-node-11 "Node Recover - CRM down w/ group - fence " do_test rec-node-12 "Node Recover - nothing active - fence " do_test rec-node-13 "Node Recover - failed resource + shutdown - fence " do_test rec-node-15 "Node Recover - unknown lrm section" do_test rec-node-14 "Serialize all stonith's" echo "" do_test multi1 "Multiple Active (stop/start)" echo "" do_test migrate-begin "Normal migration" do_test migrate-success "Completed migration" do_test migrate-partial-1 "Completed migration, missing stop on source" do_test migrate-partial-2 "Successful migrate_to only" do_test migrate-partial-3 "Successful migrate_to only, target down" do_test migrate-partial-4 "Migrate from the correct host after migrate_to+migrate_from" do_test migrate-fail-2 "Failed migrate_from" do_test migrate-fail-3 "Failed migrate_from + stop on source" do_test migrate-fail-4 "Failed migrate_from + stop on target - ideally we wouldn't need to re-stop on target" do_test migrate-fail-5 "Failed migrate_from + stop on source and target" do_test migrate-fail-6 "Failed migrate_to" do_test migrate-fail-7 "Failed migrate_to + stop on source" do_test migrate-fail-8 "Failed migrate_to + stop on target - ideally we wouldn't need to re-stop on target" do_test migrate-fail-9 "Failed migrate_to + stop on source and target" 
do_test migrate-stop "Migration in a stopping stack" do_test migrate-start "Migration in a starting stack" do_test migrate-stop_start "Migration in a restarting stack" do_test migrate-stop-complex "Migration in a complex stopping stack" do_test migrate-start-complex "Migration in a complex starting stack" do_test migrate-stop-start-complex "Migration in a complex moving stack" do_test migrate-shutdown "Order the post-migration 'stop' before node shutdown" do_test migrate-1 "Migrate (migrate)" do_test migrate-2 "Migrate (stable)" do_test migrate-3 "Migrate (failed migrate_to)" do_test migrate-4 "Migrate (failed migrate_from)" do_test novell-252693 "Migration in a stopping stack" do_test novell-252693-2 "Migration in a starting stack" do_test novell-252693-3 "Non-Migration in a starting and stopping stack" do_test bug-1820 "Migration in a group" do_test bug-1820-1 "Non-migration in a group" do_test migrate-5 "Primitive migration with a clone" do_test migrate-fencing "Migration after Fencing" #echo "" #do_test complex1 "Complex " do_test bug-lf-2422 "Dependency on partially active group - stop ocfs:*" echo "" do_test clone-anon-probe-1 "Probe the correct (anonymous) clone instance for each node" do_test clone-anon-probe-2 "Avoid needless re-probing of anonymous clones" do_test clone-anon-failcount "Merge failcounts for anonymous clones" do_test inc0 "Incarnation start" do_test inc1 "Incarnation start order" do_test inc2 "Incarnation silent restart, stop, move" do_test inc3 "Inter-incarnation ordering, silent restart, stop, move" do_test inc4 "Inter-incarnation ordering, silent restart, stop, move (ordered)" do_test inc5 "Inter-incarnation ordering, silent restart, stop, move (restart 1)" do_test inc6 "Inter-incarnation ordering, silent restart, stop, move (restart 2)" do_test inc7 "Clone colocation" do_test inc8 "Clone anti-colocation" do_test inc9 "Non-unique clone" do_test inc10 "Non-unique clone (stop)" do_test inc11 "Primitive colocation with clones" do_test inc12 "Clone shutdown" do_test cloned-group "Make sure only the correct number of cloned groups are started" do_test clone-no-shuffle "Don't prioritize allocation of instances that must be moved" do_test clone-max-zero "Orphan processing with clone-max=0" do_test clone-anon-dup "Bug LF#2087 - Correctly parse the state of anonymous clones that are active more than once per node" do_test bug-lf-2160 "Don't shuffle clones due to colocation" do_test bug-lf-2213 "clone-node-max enforcement for cloned groups" do_test bug-lf-2153 "Clone ordering constraints" do_test bug-lf-2361 "Ensure clones observe mandatory ordering constraints if the LHS is unrunnable" do_test bug-lf-2317 "Avoid needless restart of primitive depending on a clone" do_test clone-colocate-instance-1 "Colocation with a specific clone instance (negative example)" do_test clone-colocate-instance-2 "Colocation with a specific clone instance" do_test clone-order-instance "Ordering with specific clone instances" do_test bug-lf-2453 "Enforce mandatory clone ordering without colocation" do_test bug-lf-2508 "Correctly reconstruct the status of anonymous cloned groups" do_test bug-lf-2544 "Balanced clone placement" do_test bug-lf-2445 "Redistribute clones with node-max > 1 and stickiness = 0" do_test bug-lf-2574 "Avoid clone shuffle" do_test bug-lf-2581 "Avoid group restart due to unrelated clone (re)start" echo "" do_test master-0 "Stopped -> Slave" do_test master-1 "Stopped -> Promote" do_test master-2 "Stopped -> Promote : notify" do_test master-3 "Stopped -> Promote : master
location" do_test master-4 "Started -> Promote : master location" do_test master-5 "Promoted -> Promoted" do_test master-6 "Promoted -> Promoted (2)" do_test master-7 "Promoted -> Fenced" do_test master-8 "Promoted -> Fenced -> Moved" do_test master-9 "Stopped + Promotable + No quorum" do_test master-10 "Stopped -> Promotable : notify with monitor" do_test master-11 "Stopped -> Promote : colocation" do_test novell-239082 "Demote/Promote ordering" do_test novell-239087 "Stable master placement" do_test master-12 "Promotion based solely on rsc_location constraints" do_test master-13 "Include preferences of colocated resources when placing master" do_test master-demote "Ordering when actions depends on demoting a slave resource" do_test master-ordering "Prevent resources from starting that need a master" do_test bug-1765 "Master-Master Colocation (dont stop the slaves)" do_test master-group "Promotion of cloned groups" do_test bug-lf-1852 "Don't shuffle master/slave instances unnecessarily" do_test master-failed-demote "Dont retry failed demote actions" do_test master-failed-demote-2 "Dont retry failed demote actions (notify=false)" do_test master-depend "Ensure resources that depend on the master don't get allocated until the master does" do_test master-reattach "Re-attach to a running master" do_test master-allow-start "Don't include master score if it would prevent allocation" do_test master-colocation "Allow master instances placemaker to be influenced by colocation constraints" do_test master-pseudo "Make sure promote/demote pseudo actions are created correctly" do_test master-role "Prevent target-role from promoting more than master-max instances" do_test bug-lf-2358 "Master-Master anti-colocation" do_test master-promotion-constraint "Mandatory master colocation constraints" do_test unmanaged-master "Ensure role is preserved for unmanaged resources" do_test master-unmanaged-monitor "Start the correct monitor operation for unmanaged masters" do_test master-demote-2 "Demote does not clear past failure" do_test master-move "Move master based on failure of colocated group" do_test master-probed-score "Observe the promotion score of probed resources" do_test colocation_constraint_stops_master "cl#5054 - Ensure master is demoted when stopped by colocation constraint" do_test colocation_constraint_stops_slave "cl#5054 - Ensure slave is not demoted when stopped by colocation constraint" do_test order_constraint_stops_master "cl#5054 - Ensure master is demoted when stopped by order constraint" do_test order_constraint_stops_slave "cl#5054 - Ensure slave is not demoted when stopped by order constraint" do_test master_monitor_restart "cl#5072 - Ensure master monitor operation will start after promotion." 
echo "" do_test history-1 "Correctly parse stateful-1 resource state" echo "" do_test managed-0 "Managed (reference)" do_test managed-1 "Not managed - down " do_test managed-2 "Not managed - up " do_test bug-5028 "Shutdown should block if anything depends on an unmanaged resource" do_test bug-5028-detach "Ensure detach still works" do_test bug-5028-bottom "Ensure shutdown still blocks if the blocked resource is at the bottom of the stack" echo "" do_test interleave-0 "Interleave (reference)" do_test interleave-1 "coloc - not interleaved" do_test interleave-2 "coloc - interleaved " do_test interleave-3 "coloc - interleaved (2)" do_test interleave-pseudo-stop "Interleaved clone during stonith" do_test interleave-stop "Interleaved clone during stop" do_test interleave-restart "Interleaved clone during dependancy restart" echo "" do_test notify-0 "Notify reference" do_test notify-1 "Notify simple" do_test notify-2 "Notify simple, confirm" do_test notify-3 "Notify move, confirm" do_test novell-239079 "Notification priority" #do_test notify-2 "Notify - 764" echo "" do_test 594 "OSDL #594 - Unrunnable actions scheduled in transition" do_test 662 "OSDL #662 - Two resources start on one node when incarnation_node_max = 1" do_test 696 "OSDL #696 - CRM starts stonith RA without monitor" do_test 726 "OSDL #726 - Attempting to schedule rsc_posic041_monitor_5000 _after_ a stop" do_test 735 "OSDL #735 - Correctly detect that rsc_hadev1 is stopped on hadev3" do_test 764 "OSDL #764 - Missing monitor op for DoFencing:child_DoFencing:1" do_test 797 "OSDL #797 - Assert triggered: task_id_i > max_call_id" do_test 829 "OSDL #829" do_test 994 "OSDL #994 - Stopping the last resource in a resource group causes the entire group to be restarted" do_test 994-2 "OSDL #994 - with a dependant resource" do_test 1360 "OSDL #1360 - Clone stickiness" do_test 1484 "OSDL #1484 - on_fail=stop" do_test 1494 "OSDL #1494 - Clone stability" do_test unrunnable-1 "Unrunnable" do_test stonith-0 "Stonith loop - 1" do_test stonith-1 "Stonith loop - 2" do_test stonith-2 "Stonith loop - 3" do_test stonith-3 "Stonith startup" do_test stonith-4 "Stonith node state" --rc 4 do_test bug-1572-1 "Recovery of groups depending on master/slave" do_test bug-1572-2 "Recovery of groups depending on master/slave when the master is never re-promoted" do_test bug-1685 "Depends-on-master ordering" do_test bug-1822 "Dont promote partially active groups" do_test bug-pm-11 "New resource added to a m/s group" do_test bug-pm-12 "Recover only the failed portion of a cloned group" do_test bug-n-387749 "Don't shuffle clone instances" do_test bug-n-385265 "Don't ignore the failure stickiness of group children - resource_idvscommon should stay stopped" do_test bug-n-385265-2 "Ensure groups are migrated instead of remaining partially active on the current node" do_test bug-lf-1920 "Correctly handle probes that find active resources" do_test bnc-515172 "Location constraint with multiple expressions" do_test colocate-primitive-with-clone "Optional colocation with a clone" do_test use-after-free-merge "Use-after-free in native_merge_weights" do_test bug-lf-2551 "STONITH ordering for stop" do_test bug-lf-2606 "Stonith implies demote" do_test bug-lf-2474 "Ensure resource op timeout takes precedence over op_defaults" do_test bug-suse-707150 "Prevent vm-01 from starting due to colocation/ordering" do_test bug-5014-A-start-B-start "Verify when A starts B starts using symmetrical=false" do_test bug-5014-A-stop-B-started "Verify when A stops B does not stop if it has already 
started using symmetric=false" do_test bug-5014-A-stopped-B-stopped "Verify when A is stopped and B has not started, B does not start before A using symmetric=false" do_test bug-5014-CthenAthenB-C-stopped "Verify when C then A is symmetrical=true, A then B is symmetric=false, and C is stopped that nothing starts." do_test bug-5014-CLONE-A-start-B-start "Verify when A starts B starts using clone resources with symmetric=false" do_test bug-5014-CLONE-A-stop-B-started "Verify when A stops B does not stop if it has already started using clone resources with symmetric=false." do_test bug-5014-GROUP-A-start-B-start "Verify when A starts B starts when using group resources with symmetric=false." do_test bug-5014-GROUP-A-stopped-B-started "Verify when A stops B does not stop if it has already started using group resources with symmetric=false." do_test bug-5014-GROUP-A-stopped-B-stopped "Verify when A is stopped and B has not started, B does not start before A using group resources with symmetric=false." do_test bug-5014-ordered-set-symmetrical-false "Verify ordered sets work with symmetrical=false" do_test bug-5014-ordered-set-symmetrical-true "Verify ordered sets work with symmetrical=true" do_test bug-5007-masterslave_colocation "Verify use of colocation scores other than INFINITY and -INFINITY work on multi-state resources." do_test bug-5038 "Prevent restart of anonymous clones when clone-max decreases" do_test bug-5025-1 "Automatically clean up failcount after resource config change with reload" do_test bug-5025-2 "Make sure clear failcount action isn't set when config does not change." do_test bug-5025-3 "Automatically clean up failcount after resource config change with restart" do_test failcount "Ensure failcounts are correctly expired" do_test monitor-onfail-restart "bug-5058 - Monitor failure with on-fail set to restart" do_test monitor-onfail-stop "bug-5058 - Monitor failure wiht on-fail set to stop" do_test bug-5059 "No need to restart p_stateful1:*" do_test bug-5069-op-enabled "Test on-fail=ignore with failure when monitor is enabled." do_test bug-5069-op-disabled "Test on-fail-ignore with failure when monitor is disabled." do_test ignore_stonith_rsc_order1 "cl#5056- Ignore order constraint between stonith and non-stonith rsc." do_test ignore_stonith_rsc_order2 "cl#5056- Ignore order constraint with group rsc containing mixed stonith and non-stonith." do_test ignore_stonith_rsc_order3 "cl#5056- Ignore order constraint, stonith clone and mixed group" do_test ignore_stonith_rsc_order4 "cl#5056- Ignore order constraint, stonith clone and clone with nested mixed group" do_test honor_stonith_rsc_order1 "cl#5056- Honor order constraint, stonith clone and pure stonith group(single rsc)." do_test honor_stonith_rsc_order2 "cl#5056- Honor order constraint, stonith clone and pure stonith group(multiple rsc)" do_test honor_stonith_rsc_order3 "cl#5056- Honor order constraint, stonith clones with nested pure stonith group." do_test honor_stonith_rsc_order4 "cl#5056- Honor order constraint, between two native stonith rscs." 
echo "" do_test systemhealth1 "System Health () #1" do_test systemhealth2 "System Health () #2" do_test systemhealth3 "System Health () #3" do_test systemhealthn1 "System Health (None) #1" do_test systemhealthn2 "System Health (None) #2" do_test systemhealthn3 "System Health (None) #3" do_test systemhealthm1 "System Health (Migrate On Red) #1" do_test systemhealthm2 "System Health (Migrate On Red) #2" do_test systemhealthm3 "System Health (Migrate On Red) #3" do_test systemhealtho1 "System Health (Only Green) #1" do_test systemhealtho2 "System Health (Only Green) #2" do_test systemhealtho3 "System Health (Only Green) #3" do_test systemhealthp1 "System Health (Progessive) #1" do_test systemhealthp2 "System Health (Progessive) #2" do_test systemhealthp3 "System Health (Progessive) #3" echo "" do_test utilization "Placement Strategy - utilization" do_test minimal "Placement Strategy - minimal" do_test balanced "Placement Strategy - balanced" echo "" do_test placement-stickiness "Optimized Placement Strategy - stickiness" do_test placement-priority "Optimized Placement Strategy - priority" do_test placement-location "Optimized Placement Strategy - location" do_test placement-capacity "Optimized Placement Strategy - capacity" echo "" do_test utilization-order1 "Utilization Order - Simple" do_test utilization-order2 "Utilization Order - Complex" do_test utilization-order3 "Utilization Order - Migrate" do_test utilization-order4 "Utilization Order - Live Mirgration (bnc#695440)" do_test utilization-shuffle "Don't displace prmExPostgreSQLDB2 on act2, Start prmExPostgreSQLDB1 on act3" do_test load-stopped-loop "Avoid transition loop due to load_stopped (cl#5044)" echo "" do_test reprobe-target_rc "Ensure correct target_rc for reprobe of inactive resources" echo "" do_test stopped-monitor-00 "Stopped Monitor - initial start" do_test stopped-monitor-01 "Stopped Monitor - failed started" do_test stopped-monitor-02 "Stopped Monitor - started multi-up" do_test stopped-monitor-03 "Stopped Monitor - stop started" do_test stopped-monitor-04 "Stopped Monitor - failed stop" do_test stopped-monitor-05 "Stopped Monitor - start unmanaged" do_test stopped-monitor-06 "Stopped Monitor - unmanaged multi-up" do_test stopped-monitor-07 "Stopped Monitor - start unmanaged multi-up" do_test stopped-monitor-08 "Stopped Monitor - migrate" do_test stopped-monitor-09 "Stopped Monitor - unmanage started" do_test stopped-monitor-10 "Stopped Monitor - unmanaged started multi-up" do_test stopped-monitor-11 "Stopped Monitor - stop unmanaged started" do_test stopped-monitor-12 "Stopped Monitor - unmanaged started multi-up (targer-role="Stopped")" do_test stopped-monitor-20 "Stopped Monitor - initial stop" do_test stopped-monitor-21 "Stopped Monitor - stopped single-up" do_test stopped-monitor-22 "Stopped Monitor - stopped multi-up" do_test stopped-monitor-23 "Stopped Monitor - start stopped" do_test stopped-monitor-24 "Stopped Monitor - unmanage stopped" do_test stopped-monitor-25 "Stopped Monitor - unmanaged stopped multi-up" do_test stopped-monitor-26 "Stopped Monitor - start unmanaged stopped" do_test stopped-monitor-27 "Stopped Monitor - unmanaged stopped multi-up (target-role="Started")" do_test stopped-monitor-30 "Stopped Monitor - new node started" do_test stopped-monitor-31 "Stopped Monitor - new node stopped" echo"" do_test ticket-primitive-1 "Ticket - Primitive (loss-policy=stop, initial)" do_test ticket-primitive-2 "Ticket - Primitive (loss-policy=stop, granted)" do_test ticket-primitive-3 "Ticket - Primitive 
(loss-policy-stop, revoked)" do_test ticket-primitive-4 "Ticket - Primitive (loss-policy=demote, initial)" do_test ticket-primitive-5 "Ticket - Primitive (loss-policy=demote, granted)" do_test ticket-primitive-6 "Ticket - Primitive (loss-policy=demote, revoked)" do_test ticket-primitive-7 "Ticket - Primitive (loss-policy=fence, initial)" do_test ticket-primitive-8 "Ticket - Primitive (loss-policy=fence, granted)" do_test ticket-primitive-9 "Ticket - Primitive (loss-policy=fence, revoked)" do_test ticket-primitive-10 "Ticket - Primitive (loss-policy=freeze, initial)" do_test ticket-primitive-11 "Ticket - Primitive (loss-policy=freeze, granted)" do_test ticket-primitive-12 "Ticket - Primitive (loss-policy=freeze, revoked)" do_test ticket-primitive-13 "Ticket - Primitive (loss-policy=stop, standby, granted)" do_test ticket-primitive-14 "Ticket - Primitive (loss-policy=stop, granted, standby)" do_test ticket-primitive-15 "Ticket - Primitive (loss-policy=stop, standby, revoked)" do_test ticket-primitive-16 "Ticket - Primitive (loss-policy=demote, standby, granted)" do_test ticket-primitive-17 "Ticket - Primitive (loss-policy=demote, granted, standby)" do_test ticket-primitive-18 "Ticket - Primitive (loss-policy=demote, standby, revoked)" do_test ticket-primitive-19 "Ticket - Primitive (loss-policy=fence, standby, granted)" do_test ticket-primitive-20 "Ticket - Primitive (loss-policy=fence, granted, standby)" do_test ticket-primitive-21 "Ticket - Primitive (loss-policy=fence, standby, revoked)" do_test ticket-primitive-22 "Ticket - Primitive (loss-policy=freeze, standby, granted)" do_test ticket-primitive-23 "Ticket - Primitive (loss-policy=freeze, granted, standby)" do_test ticket-primitive-24 "Ticket - Primitive (loss-policy=freeze, standby, revoked)" echo"" do_test ticket-group-1 "Ticket - Group (loss-policy=stop, initial)" do_test ticket-group-2 "Ticket - Group (loss-policy=stop, granted)" do_test ticket-group-3 "Ticket - Group (loss-policy-stop, revoked)" do_test ticket-group-4 "Ticket - Group (loss-policy=demote, initial)" do_test ticket-group-5 "Ticket - Group (loss-policy=demote, granted)" do_test ticket-group-6 "Ticket - Group (loss-policy=demote, revoked)" do_test ticket-group-7 "Ticket - Group (loss-policy=fence, initial)" do_test ticket-group-8 "Ticket - Group (loss-policy=fence, granted)" do_test ticket-group-9 "Ticket - Group (loss-policy=fence, revoked)" do_test ticket-group-10 "Ticket - Group (loss-policy=freeze, initial)" do_test ticket-group-11 "Ticket - Group (loss-policy=freeze, granted)" do_test ticket-group-12 "Ticket - Group (loss-policy=freeze, revoked)" do_test ticket-group-13 "Ticket - Group (loss-policy=stop, standby, granted)" do_test ticket-group-14 "Ticket - Group (loss-policy=stop, granted, standby)" do_test ticket-group-15 "Ticket - Group (loss-policy=stop, standby, revoked)" do_test ticket-group-16 "Ticket - Group (loss-policy=demote, standby, granted)" do_test ticket-group-17 "Ticket - Group (loss-policy=demote, granted, standby)" do_test ticket-group-18 "Ticket - Group (loss-policy=demote, standby, revoked)" do_test ticket-group-19 "Ticket - Group (loss-policy=fence, standby, granted)" do_test ticket-group-20 "Ticket - Group (loss-policy=fence, granted, standby)" do_test ticket-group-21 "Ticket - Group (loss-policy=fence, standby, revoked)" do_test ticket-group-22 "Ticket - Group (loss-policy=freeze, standby, granted)" do_test ticket-group-23 "Ticket - Group (loss-policy=freeze, granted, standby)" do_test ticket-group-24 "Ticket - Group (loss-policy=freeze, 
standby, revoked)" echo"" do_test ticket-clone-1 "Ticket - Clone (loss-policy=stop, initial)" do_test ticket-clone-2 "Ticket - Clone (loss-policy=stop, granted)" do_test ticket-clone-3 "Ticket - Clone (loss-policy-stop, revoked)" do_test ticket-clone-4 "Ticket - Clone (loss-policy=demote, initial)" do_test ticket-clone-5 "Ticket - Clone (loss-policy=demote, granted)" do_test ticket-clone-6 "Ticket - Clone (loss-policy=demote, revoked)" do_test ticket-clone-7 "Ticket - Clone (loss-policy=fence, initial)" do_test ticket-clone-8 "Ticket - Clone (loss-policy=fence, granted)" do_test ticket-clone-9 "Ticket - Clone (loss-policy=fence, revoked)" do_test ticket-clone-10 "Ticket - Clone (loss-policy=freeze, initial)" do_test ticket-clone-11 "Ticket - Clone (loss-policy=freeze, granted)" do_test ticket-clone-12 "Ticket - Clone (loss-policy=freeze, revoked)" do_test ticket-clone-13 "Ticket - Clone (loss-policy=stop, standby, granted)" do_test ticket-clone-14 "Ticket - Clone (loss-policy=stop, granted, standby)" do_test ticket-clone-15 "Ticket - Clone (loss-policy=stop, standby, revoked)" do_test ticket-clone-16 "Ticket - Clone (loss-policy=demote, standby, granted)" do_test ticket-clone-17 "Ticket - Clone (loss-policy=demote, granted, standby)" do_test ticket-clone-18 "Ticket - Clone (loss-policy=demote, standby, revoked)" do_test ticket-clone-19 "Ticket - Clone (loss-policy=fence, standby, granted)" do_test ticket-clone-20 "Ticket - Clone (loss-policy=fence, granted, standby)" do_test ticket-clone-21 "Ticket - Clone (loss-policy=fence, standby, revoked)" do_test ticket-clone-22 "Ticket - Clone (loss-policy=freeze, standby, granted)" do_test ticket-clone-23 "Ticket - Clone (loss-policy=freeze, granted, standby)" do_test ticket-clone-24 "Ticket - Clone (loss-policy=freeze, standby, revoked)" echo"" do_test ticket-master-1 "Ticket - Master (loss-policy=stop, initial)" do_test ticket-master-2 "Ticket - Master (loss-policy=stop, granted)" do_test ticket-master-3 "Ticket - Master (loss-policy-stop, revoked)" do_test ticket-master-4 "Ticket - Master (loss-policy=demote, initial)" do_test ticket-master-5 "Ticket - Master (loss-policy=demote, granted)" do_test ticket-master-6 "Ticket - Master (loss-policy=demote, revoked)" do_test ticket-master-7 "Ticket - Master (loss-policy=fence, initial)" do_test ticket-master-8 "Ticket - Master (loss-policy=fence, granted)" do_test ticket-master-9 "Ticket - Master (loss-policy=fence, revoked)" do_test ticket-master-10 "Ticket - Master (loss-policy=freeze, initial)" do_test ticket-master-11 "Ticket - Master (loss-policy=freeze, granted)" do_test ticket-master-12 "Ticket - Master (loss-policy=freeze, revoked)" do_test ticket-master-13 "Ticket - Master (loss-policy=stop, standby, granted)" do_test ticket-master-14 "Ticket - Master (loss-policy=stop, granted, standby)" do_test ticket-master-15 "Ticket - Master (loss-policy=stop, standby, revoked)" do_test ticket-master-16 "Ticket - Master (loss-policy=demote, standby, granted)" do_test ticket-master-17 "Ticket - Master (loss-policy=demote, granted, standby)" do_test ticket-master-18 "Ticket - Master (loss-policy=demote, standby, revoked)" do_test ticket-master-19 "Ticket - Master (loss-policy=fence, standby, granted)" do_test ticket-master-20 "Ticket - Master (loss-policy=fence, granted, standby)" do_test ticket-master-21 "Ticket - Master (loss-policy=fence, standby, revoked)" do_test ticket-master-22 "Ticket - Master (loss-policy=freeze, standby, granted)" do_test ticket-master-23 "Ticket - Master (loss-policy=freeze, 
granted, standby)" do_test ticket-master-24 "Ticket - Master (loss-policy=freeze, standby, revoked)" echo "" do_test ticket-rsc-sets-1 "Ticket - Resource sets (1 ticket, initial)" do_test ticket-rsc-sets-2 "Ticket - Resource sets (1 ticket, granted)" do_test ticket-rsc-sets-3 "Ticket - Resource sets (1 ticket, revoked)" do_test ticket-rsc-sets-4 "Ticket - Resource sets (2 tickets, initial)" do_test ticket-rsc-sets-5 "Ticket - Resource sets (2 tickets, granted)" do_test ticket-rsc-sets-6 "Ticket - Resource sets (2 tickets, granted)" do_test ticket-rsc-sets-7 "Ticket - Resource sets (2 tickets, revoked)" do_test ticket-rsc-sets-8 "Ticket - Resource sets (1 ticket, standby, granted)" do_test ticket-rsc-sets-9 "Ticket - Resource sets (1 ticket, granted, standby)" do_test ticket-rsc-sets-10 "Ticket - Resource sets (1 ticket, standby, revoked)" do_test ticket-rsc-sets-11 "Ticket - Resource sets (2 tickets, standby, granted)" do_test ticket-rsc-sets-12 "Ticket - Resource sets (2 tickets, standby, granted)" do_test ticket-rsc-sets-13 "Ticket - Resource sets (2 tickets, granted, standby)" do_test ticket-rsc-sets-14 "Ticket - Resource sets (2 tickets, standby, revoked)" echo "" do_test template-1 "Template - 1" do_test template-2 "Template - 2" do_test template-3 "Template - 3 (merge operations)" do_test template-coloc-1 "Template - Colocation 1" do_test template-coloc-2 "Template - Colocation 2" do_test template-coloc-3 "Template - Colocation 3" do_test template-order-1 "Template - Order 1" do_test template-order-2 "Template - Order 2" do_test template-order-3 "Template - Order 3" do_test template-ticket "Template - Ticket" do_test template-rsc-sets-1 "Template - Resource Sets 1" do_test template-rsc-sets-2 "Template - Resource Sets 2" do_test template-rsc-sets-3 "Template - Resource Sets 3" do_test template-rsc-sets-4 "Template - Resource Sets 4" echo "" test_results diff --git a/pengine/test10/group-fail.dot b/pengine/test10/group-fail.dot new file mode 100644 index 0000000000..58aec718f2 --- /dev/null +++ b/pengine/test10/group-fail.dot @@ -0,0 +1,38 @@ +digraph "g" { +"all_stopped" [ style=bold color="green" fontcolor="orange"] +"group1_running_0" [ style=bold color="green" fontcolor="orange"] +"group1_start_0" -> "group1_running_0" [ style = bold] +"group1_start_0" -> "rsc1_start_0 node1" [ style = bold] +"group1_start_0" -> "rsc2_start_0 node1" [ style = bold] +"group1_start_0" -> "rsc3_start_0 node1" [ style = bold] +"group1_start_0" -> "rsc4_start_0 node1" [ style = bold] +"group1_start_0" [ style=bold color="green" fontcolor="orange"] +"group1_stop_0" -> "group1_stopped_0" [ style = bold] +"group1_stop_0" -> "rsc2_stop_0 node1" [ style = bold] +"group1_stop_0" -> "rsc4_stop_0 node1" [ style = bold] +"group1_stop_0" [ style=bold color="green" fontcolor="orange"] +"group1_stopped_0" -> "group1_start_0" [ style = bold] +"group1_stopped_0" [ style=bold color="green" fontcolor="orange"] +"probe_complete node1" [ style=bold color="green" fontcolor="black"] +"probe_complete node2" [ style=bold color="green" fontcolor="black"] +"rsc1_start_0 node1" -> "group1_running_0" [ style = bold] +"rsc1_start_0 node1" -> "rsc2_start_0 node1" [ style = bold] +"rsc1_start_0 node1" [ style=bold color="green" fontcolor="black"] +"rsc2_start_0 node1" -> "group1_running_0" [ style = bold] +"rsc2_start_0 node1" -> "rsc3_start_0 node1" [ style = bold] +"rsc2_start_0 node1" [ style=bold color="green" fontcolor="black"] +"rsc2_stop_0 node1" -> "all_stopped" [ style = bold] +"rsc2_stop_0 node1" -> "group1_stopped_0" [ 
style = bold] +"rsc2_stop_0 node1" -> "rsc2_start_0 node1" [ style = bold] +"rsc2_stop_0 node1" [ style=bold color="green" fontcolor="black"] +"rsc3_start_0 node1" -> "group1_running_0" [ style = bold] +"rsc3_start_0 node1" -> "rsc4_start_0 node1" [ style = bold] +"rsc3_start_0 node1" [ style=bold color="green" fontcolor="black"] +"rsc4_start_0 node1" -> "group1_running_0" [ style = bold] +"rsc4_start_0 node1" [ style=bold color="green" fontcolor="black"] +"rsc4_stop_0 node1" -> "all_stopped" [ style = bold] +"rsc4_stop_0 node1" -> "group1_stopped_0" [ style = bold] +"rsc4_stop_0 node1" -> "rsc2_stop_0 node1" [ style = bold] +"rsc4_stop_0 node1" -> "rsc4_start_0 node1" [ style = bold] +"rsc4_stop_0 node1" [ style=bold color="green" fontcolor="black"] +} diff --git a/pengine/test10/group-fail.exp b/pengine/test10/group-fail.exp new file mode 100644 index 0000000000..b9ed6c2cc3 --- /dev/null +++ b/pengine/test10/group-fail.exp @@ -0,0 +1,192 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/pengine/test10/group-fail.scores b/pengine/test10/group-fail.scores new file mode 100644 index 0000000000..12cba91685 --- /dev/null +++ b/pengine/test10/group-fail.scores @@ -0,0 +1,19 @@ +Allocation scores: +group_color: group1 allocation score on node1: 0 +group_color: group1 allocation score on node2: 0 +group_color: rsc1 allocation score on node1: 0 +group_color: rsc1 allocation score on node2: 0 +group_color: rsc2 allocation score on node1: 0 +group_color: rsc2 allocation score on node2: 0 +group_color: rsc3 allocation score on node1: 0 +group_color: rsc3 allocation score on node2: 0 +group_color: rsc4 allocation score on node1: 0 +group_color: rsc4 allocation score on node2: 0 +native_color: rsc1 allocation score on node1: 0 +native_color: rsc1 allocation score on node2: 0 +native_color: rsc2 allocation score on node1: 0 +native_color: rsc2 allocation score on node2: -INFINITY +native_color: rsc3 allocation score on node1: 0 +native_color: rsc3 allocation score on node2: -INFINITY +native_color: rsc4 allocation score on node1: 0 +native_color: rsc4 allocation score on node2: -INFINITY diff --git a/pengine/test10/group-fail.summary b/pengine/test10/group-fail.summary new file mode 100644 index 0000000000..aa03d2121e --- /dev/null +++ b/pengine/test10/group-fail.summary @@ -0,0 +1,38 @@ + +Current cluster status: +Online: [ node1 node2 ] + + Resource Group: group1 + rsc1 (heartbeat:apache): Stopped + rsc2 (heartbeat:apache): Started node1 + rsc3 (heartbeat:apache): Stopped + rsc4 (heartbeat:apache): Started node1 + +Transition Summary: + * Start rsc1 (node1) + * Restart rsc2 (Started node1) + * Start rsc3 (node1) + * Restart rsc4 (Started node1) + +Executing cluster transition: + * Pseudo action: group1_stop_0 + * Resource action: rsc4 stop on node1 + * Resource action: rsc2 stop on node1 + * Pseudo action: all_stopped + * Pseudo action: group1_stopped_0 + * Pseudo action: group1_start_0 + * Resource action: rsc1 start on node1 + * Resource action: rsc2 start on node1 + * Resource action: rsc3 start on node1 + * Resource action: rsc4 start on node1 + * Pseudo action: group1_running_0 + +Revised cluster status: +Online: [ node1 node2 ] + + Resource Group: 
group1 + rsc1 (heartbeat:apache): Started node1 + rsc2 (heartbeat:apache): Started node1 + rsc3 (heartbeat:apache): Started node1 + rsc4 (heartbeat:apache): Started node1 + diff --git a/pengine/test10/group-fail.xml b/pengine/test10/group-fail.xml new file mode 100644 index 0000000000..15f26a78de --- /dev/null +++ b/pengine/test10/group-fail.xml @@ -0,0 +1,64 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/pengine/test10/quorum-5.exp b/pengine/test10/quorum-5.exp index 1f31eb71e3..ccff65d6ec 100644 --- a/pengine/test10/quorum-5.exp +++ b/pengine/test10/quorum-5.exp @@ -1,135 +1,135 @@ - + diff --git a/pengine/test10/quorum-5.xml b/pengine/test10/quorum-5.xml index b338da65d7..9336ceaf12 100644 --- a/pengine/test10/quorum-5.xml +++ b/pengine/test10/quorum-5.xml @@ -1,47 +1,47 @@ - + diff --git a/tools/cibadmin.c b/tools/cibadmin.c index 4e884222cf..7d4e6c928c 100644 --- a/tools/cibadmin.c +++ b/tools/cibadmin.c @@ -1,598 +1,599 @@ /* * Copyright (C) 2004 Andrew Beekhof * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public * License as published by the Free Software Foundation; either * version 2 of the License, or (at your option) any later version. * * This software is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * General Public License for more details. * * You should have received a copy of the GNU General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ #include #include #include #include #include #include #include #include #include #include #include #include #include int exit_code = pcmk_ok; int message_timer_id = -1; int message_timeout_ms = 30; GMainLoop *mainloop = NULL; const char *host = NULL; void usage(const char *cmd, int exit_status); int do_init(void); int do_work(xmlNode * input, int command_options, xmlNode ** output); gboolean admin_message_timeout(gpointer data); void cib_connection_destroy(gpointer user_data); void cibadmin_op_callback(xmlNode * msg, int call_id, int rc, xmlNode * output, void *user_data); int command_options = 0; const char *cib_action = NULL; typedef struct str_list_s { int num_items; char *value; struct str_list_s *next; } str_list_t; char *obj_type = NULL; char *status = NULL; char *migrate_from = NULL; char *migrate_res = NULL; char *subtype = NULL; char *reset = NULL; int request_id = 0; int operation_status = 0; cib_t *the_cib = NULL; gboolean force_flag = FALSE; /* *INDENT-OFF* */ static struct crm_option long_options[] = { {"help", 0, 0, '?', "\tThis text"}, {"version", 0, 0, '$', "\tVersion information" }, {"verbose", 0, 0, 'V', "\tIncrease debug output\n"}, {"-spacer-", 0, 0, '-', "Commands:"}, {"upgrade", 0, 0, 'u', "\tUpgrade the configuration to the latest syntax"}, {"query", 0, 0, 'Q', "\tQuery the contents of the CIB"}, {"erase", 0, 0, 'E', "\tErase the contents of the whole CIB"}, {"bump", 0, 0, 'B', "\tIncrease the CIB's epoch value by 1"}, {"create", 0, 0, 'C', "\tCreate an object in the CIB. Will fail if the object already exists."}, {"modify", 0, 0, 'M', "\tFind the object somewhere in the CIB's XML tree and update it. 
Fails if the object does not exist unless -c is specified"}, {"patch", 0, 0, 'P', "\tSupply an update in the form of an xml diff (See also: crm_diff)"}, {"replace", 0, 0, 'R', "\tRecursively replace an object in the CIB"}, {"delete", 0, 0, 'D', "\tDelete the first object matching the supplied criteria, e.g. "}, {"-spacer-", 0, 0, '-', "\n\t\t\tThe tagname and all attributes must match in order for the element to be deleted"}, {"delete-all", 0, 0, 'd', "\tWhen used with --xpath, remove all matching objects in the configuration instead of just the first one"}, {"md5-sum", 0, 0, '5', "\tCalculate the on-disk CIB digest"}, {"md5-sum-versioned", 0, 0, '6', "\tCalculate an on-the-wire versioned CIB digest"}, {"sync", 0, 0, 'S', "\t(Advanced) Force a refresh of the CIB to all nodes\n"}, {"make-slave", 0, 0, 'r', NULL, 1}, {"make-master", 0, 0, 'w', NULL, 1}, {"is-master", 0, 0, 'm', NULL, 1}, {"empty", 0, 0, 'a', "\tOutput an empty CIB"}, {"blank", 0, 0, 'a', NULL, 1}, {"-spacer-",1, 0, '-', "\nAdditional options:"}, {"force", 0, 0, 'f'}, {"timeout", 1, 0, 't', "Time (in seconds) to wait before declaring the operation failed"}, {"sync-call", 0, 0, 's', "Wait for call to complete before returning"}, {"local", 0, 0, 'l', "\tCommand takes effect locally. Should only be used for queries"}, {"allow-create",0, 0, 'c', "(Advanced) Allow the target of a -M operation to be created if it does not exist"}, {"no-children", 0, 0, 'n', "(Advanced) When querying an object, do not include its children in the result\n"}, {"no-bcast", 0, 0, 'b', NULL, 1}, {"-spacer-", 0, 0, '-', "Data:"}, {"xml-text", 1, 0, 'X', "Retrieve XML from the supplied string"}, {"xml-file", 1, 0, 'x', "Retrieve XML from the named file"}, {"xml-pipe", 0, 0, 'p', "Retrieve XML from stdin\n"}, {"xpath", 1, 0, 'A', "A valid XPath to use instead of -o"}, {"scope", 1, 0, 'o', "Limit the scope of the operation to a specific section of the CIB."}, {"-spacer-", 0, 0, '-', "\t\t\tValid values are: nodes, resources, constraints, crm_config, rsc_defaults, op_defaults, status"}, {"node", 1, 0, 'N', "(Advanced) Send command to the specified host\n"}, {"-space-", 0, 0, '!', NULL, 1}, {"-spacer-", 0, 0, '-', "\nExamples:\n"}, {"-spacer-", 0, 0, '-', "Query the configuration from the local node:", pcmk_option_paragraph}, {"-spacer-", 0, 0, '-', " cibadmin --query --local", pcmk_option_example}, {"-spacer-", 0, 0, '-', "Query just the cluster options configuration:", pcmk_option_paragraph}, {"-spacer-", 0, 0, '-', " cibadmin --query --scope crm_config", pcmk_option_example}, {"-spacer-", 0, 0, '-', "Query all 'target-role' settings:", pcmk_option_paragraph}, {"-spacer-", 0, 0, '-', " cibadmin --query --xpath \"//nvpair[@name='target-role']\"", pcmk_option_example}, {"-spacer-", 0, 0, '-', "Remove all 'is-managed' settings:", pcmk_option_paragraph}, {"-spacer-", 0, 0, '-', " cibadmin --delete-all --xpath \"//nvpair[@name='is-managed']\"", pcmk_option_example}, {"-spacer-", 0, 0, '-', "Remove the resource named 'old':", pcmk_option_paragraph}, {"-spacer-", 0, 0, '-', " cibadmin --delete --xml-text ''", pcmk_option_example}, {"-spacer-", 0, 0, '-', "Remove all resources from the configuration:", pcmk_option_paragraph}, {"-spacer-", 0, 0, '-', " cibadmin --replace --scope resources --xml-text ''", pcmk_option_example}, {"-spacer-", 0, 0, '-', "Replace the complete configuration with the contents of $HOME/pacemaker.xml:", pcmk_option_paragraph}, {"-spacer-", 0, 0, '-', " cibadmin --replace --xml-file $HOME/pacemaker.xml", pcmk_option_example},
{"-spacer-", 0, 0, '-', "Replace the constraints section of the configuration with the contents of $HOME/constraints.xml:", pcmk_option_paragraph}, {"-spacer-", 0, 0, '-', " cibadmin --replace --scope constraints --xml-file $HOME/constraints.xml", pcmk_option_example}, {"-spacer-", 0, 0, '-', "Increase the configuration version to prevent old configurations from being loaded accidentally:", pcmk_option_paragraph}, {"-spacer-", 0, 0, '-', " cibadmin --modify --xml-text ''", pcmk_option_example}, {"-spacer-", 0, 0, '-', "Edit the configuration with your favorite $EDITOR:", pcmk_option_paragraph}, {"-spacer-", 0, 0, '-', " cibadmin --query > $HOME/local.xml", pcmk_option_example}, {"-spacer-", 0, 0, '-', " $EDITOR $HOME/local.xml", pcmk_option_example}, {"-spacer-", 0, 0, '-', " cibadmin --replace --xml-file $HOME/local.xml", pcmk_option_example}, {"-spacer-", 0, 0, '-', "SEE ALSO:"}, {"-spacer-", 0, 0, '-', " CRM shell, crm(8), crm_shadow(8)"}, /* Legacy options */ {"host", 1, 0, 'h', NULL, 1}, {"force-quorum", 0, 0, 'f', NULL, 1}, {"obj_type", 1, 0, 'o', NULL, 1}, {F_CRM_DATA, 1, 0, 'X', NULL, 1}, {CIB_OP_ERASE, 0, 0, 'E', NULL, 1}, {CIB_OP_QUERY, 0, 0, 'Q', NULL, 1}, {CIB_OP_CREATE, 0, 0, 'C', NULL, 1}, {CIB_OP_REPLACE, 0, 0, 'R', NULL, 1}, {CIB_OP_UPDATE, 0, 0, 'U', NULL, 1}, {CIB_OP_MODIFY, 0, 0, 'M', NULL, 1}, {CIB_OP_DELETE, 0, 0, 'D', NULL, 1}, {CIB_OP_BUMP, 0, 0, 'B', NULL, 1}, {CIB_OP_SYNC, 0, 0, 'S', NULL, 1}, {CIB_OP_SLAVE, 0, 0, 'r', NULL, 1}, {CIB_OP_MASTER, 0, 0, 'w', NULL, 1}, {CIB_OP_ISMASTER,0, 0, 'm', NULL, 1}, {0, 0, 0, 0} }; /* *INDENT-ON* */ int main(int argc, char **argv) { int argerr = 0; int flag; const char *source = NULL; char *admin_input_xml = NULL; char *admin_input_file = NULL; gboolean dangerous_cmd = FALSE; gboolean admin_input_stdin = FALSE; xmlNode *output = NULL; xmlNode *input = NULL; int option_index = 0; crm_log_init(NULL, LOG_CRIT, FALSE, FALSE, argc, argv, FALSE); crm_set_options(NULL, "command [options] [data]", long_options, "Provides direct access to the cluster configuration." "\n\nAllows the configuration, or sections of it, to be queried, modified, replaced and deleted." 
"\n\nWhere necessary, XML data will be obtained using the -X, -x, or -p options\n"); if (argc < 2) { crm_help('?', EX_USAGE); } while (1) { flag = crm_get_option(argc, argv, &option_index); if (flag == -1) break; switch (flag) { case 't': message_timeout_ms = atoi(optarg); if (message_timeout_ms < 1) { message_timeout_ms = 30; } break; case 'A': obj_type = strdup(optarg); command_options |= cib_xpath; break; case 'u': cib_action = CIB_OP_UPGRADE; dangerous_cmd = TRUE; break; case 'E': cib_action = CIB_OP_ERASE; dangerous_cmd = TRUE; break; case 'Q': cib_action = CIB_OP_QUERY; break; case 'P': cib_action = CIB_OP_APPLY_DIFF; break; case 'S': cib_action = CIB_OP_SYNC; break; case 'U': case 'M': cib_action = CIB_OP_MODIFY; break; case 'R': cib_action = CIB_OP_REPLACE; break; case 'C': cib_action = CIB_OP_CREATE; break; case 'D': cib_action = CIB_OP_DELETE; break; case '5': cib_action = "md5-sum"; break; case '6': cib_action = "md5-sum-versioned"; break; case 'c': command_options |= cib_can_create; break; case 'n': command_options |= cib_no_children; break; case 'm': cib_action = CIB_OP_ISMASTER; command_options |= cib_scope_local; break; case 'B': cib_action = CIB_OP_BUMP; break; case 'r': dangerous_cmd = TRUE; cib_action = CIB_OP_SLAVE; break; case 'w': dangerous_cmd = TRUE; cib_action = CIB_OP_MASTER; command_options |= cib_scope_local; break; case 'V': command_options = command_options | cib_verbose; crm_bump_log_level(argc, argv); break; case '?': case '$': case '!': crm_help(flag, EX_OK); break; case 'o': crm_trace("Option %c => %s", flag, optarg); obj_type = strdup(optarg); break; case 'X': crm_trace("Option %c => %s", flag, optarg); admin_input_xml = strdup(optarg); break; case 'x': crm_trace("Option %c => %s", flag, optarg); admin_input_file = strdup(optarg); break; case 'p': admin_input_stdin = TRUE; break; + case 'N': case 'h': host = strdup(optarg); break; case 'l': command_options |= cib_scope_local; break; case 'd': cib_action = CIB_OP_DELETE; command_options |= cib_multiple; dangerous_cmd = TRUE; break; case 'b': dangerous_cmd = TRUE; command_options |= cib_inhibit_bcast; command_options |= cib_scope_local; break; case 's': command_options |= cib_sync_call; break; case 'f': force_flag = TRUE; command_options |= cib_quorum_override; break; case 'a': output = createEmptyCib(); crm_xml_add(output, XML_ATTR_CRM_VERSION, CRM_FEATURE_SET); if (optind >= argc) { crm_xml_add(output, XML_ATTR_VALIDATION, LATEST_SCHEMA_VERSION); } else { crm_xml_add(output, XML_ATTR_VALIDATION, argv[optind]); } crm_xml_add_int(output, XML_ATTR_GENERATION_ADMIN, 1); crm_xml_add_int(output, XML_ATTR_GENERATION, 0); crm_xml_add_int(output, XML_ATTR_NUMUPDATES, 0); admin_input_xml = dump_xml_formatted(output); fprintf(stdout, "%s\n", crm_str(admin_input_xml)); goto bail; break; default: printf("Argument code 0%o (%c)" " is not (?yet?) supported\n", flag, flag); ++argerr; break; } } if (optind < argc) { printf("non-option ARGV-elements: "); while (optind < argc) printf("%s ", argv[optind++]); printf("\n"); crm_help('?', EX_USAGE); } if (optind > argc || cib_action == NULL) { ++argerr; } if (argerr) { crm_help('?', EX_USAGE); } if (dangerous_cmd && force_flag == FALSE) { fprintf(stderr, "The supplied command is considered dangerous." 
" To prevent accidental destruction of the cluster," " the --force flag is required in order to proceed.\n"); fflush(stderr); exit_code = -EINVAL; goto bail; } if (admin_input_file != NULL) { input = filename2xml(admin_input_file); source = admin_input_file; } else if (admin_input_xml != NULL) { source = "input string"; input = string2xml(admin_input_xml); } else if (admin_input_stdin) { source = "STDIN"; input = stdin2xml(); } if (input != NULL) { crm_log_xml_debug(input, "[admin input]"); } else if (source) { fprintf(stderr, "Couldn't parse input from %s.\n", source); exit_code = -EINVAL; goto bail; } if (safe_str_eq(cib_action, "md5-sum")) { char *digest = NULL; if (input == NULL) { fprintf(stderr, "Please supply XML to process with -X, -x or -p\n"); exit_code = -EINVAL; goto bail; } digest = calculate_on_disk_digest(input); fprintf(stderr, "Digest: "); fprintf(stdout, "%s\n", crm_str(digest)); free(digest); goto bail; } else if (safe_str_eq(cib_action, "md5-sum-versioned")) { char *digest = NULL; const char *version = NULL; if (input == NULL) { fprintf(stderr, "Please supply XML to process with -X, -x or -p\n"); exit_code = -EINVAL; goto bail; } version = crm_element_value(input, XML_ATTR_CRM_VERSION); digest = calculate_xml_versioned_digest(input, FALSE, TRUE, version); fprintf(stderr, "Versioned (%s) digest: ", version); fprintf(stdout, "%s\n", crm_str(digest)); free(digest); goto bail; } exit_code = do_init(); if (exit_code != pcmk_ok) { crm_err("Init failed, could not perform requested operations"); fprintf(stderr, "Init failed, could not perform requested operations\n"); return -exit_code; } exit_code = do_work(input, command_options, &output); if (exit_code > 0) { /* wait for the reply by creating a mainloop and running it until * the callbacks are invoked... 
*/ request_id = exit_code; the_cib->cmds->register_callback(the_cib, request_id, message_timeout_ms, FALSE, NULL, "cibadmin_op_callback", cibadmin_op_callback); mainloop = g_main_new(FALSE); crm_trace("%s waiting for reply from the local CIB", crm_system_name); crm_info("Starting mainloop"); g_main_run(mainloop); } else if (exit_code < 0) { crm_err("Call failed: %s", pcmk_strerror(exit_code)); fprintf(stderr, "Call failed: %s\n", pcmk_strerror(exit_code)); operation_status = exit_code; if (exit_code == -pcmk_err_dtd_validation) { if (crm_str_eq(cib_action, CIB_OP_UPGRADE, TRUE)) { xmlNode *obj = NULL; int version = 0, rc = 0; rc = the_cib->cmds->query(the_cib, NULL, &obj, command_options); if (rc == pcmk_ok) { update_validation(&obj, &version, TRUE, FALSE); } } else if (output) { validate_xml_verbose(output); } } } if (output != NULL) { char *buffer = dump_xml_formatted(output); fprintf(stdout, "%s\n", crm_str(buffer)); free(buffer); free_xml(output); } crm_trace("%s exiting normally", crm_system_name); free_xml(input); free(admin_input_xml); free(admin_input_file); the_cib->cmds->signoff(the_cib); cib_delete(the_cib); crm_xml_cleanup(); bail: qb_log_fini(); return -exit_code; } int do_work(xmlNode * input, int call_options, xmlNode ** output) { /* construct the request */ the_cib->call_timeout = message_timeout_ms; if (strcasecmp(CIB_OP_REPLACE, cib_action) == 0 && safe_str_eq(crm_element_name(input), XML_TAG_CIB)) { xmlNode *status = get_object_root(XML_CIB_TAG_STATUS, input); if (status == NULL) { create_xml_node(input, XML_CIB_TAG_STATUS); } } if (strcasecmp(CIB_OP_SYNC, cib_action) == 0) { crm_trace("Performing %s op...", cib_action); return the_cib->cmds->sync_from(the_cib, host, obj_type, call_options); } else if (strcasecmp(CIB_OP_SLAVE, cib_action) == 0 && (call_options ^ cib_scope_local)) { crm_trace("Performing %s op on all nodes...", cib_action); return the_cib->cmds->set_slave_all(the_cib, call_options); } else if (strcasecmp(CIB_OP_MASTER, cib_action) == 0) { crm_trace("Performing %s op on all nodes...", cib_action); return the_cib->cmds->set_master(the_cib, call_options); } else if (cib_action != NULL) { crm_trace("Passing \"%s\" to variant_op...", cib_action); return cib_internal_op(the_cib, cib_action, host, obj_type, input, output, call_options, NULL); } else { crm_err("You must specify an operation"); } return -EINVAL; } int do_init(void) { int rc = pcmk_ok; the_cib = cib_new(); rc = the_cib->cmds->signon(the_cib, crm_system_name, cib_command); if (rc != pcmk_ok) { crm_err("Signon to CIB failed: %s", pcmk_strerror(rc)); fprintf(stderr, "Signon to CIB failed: %s\n", pcmk_strerror(rc)); } return rc; } void cib_connection_destroy(gpointer user_data) { crm_err("Connection to the CIB terminated... exiting"); g_main_quit(mainloop); return; } void cibadmin_op_callback(xmlNode * msg, int call_id, int rc, xmlNode * output, void *user_data) { char *admin_input_xml = NULL; exit_code = rc; if (output != NULL) { admin_input_xml = dump_xml_formatted(output); } if (safe_str_eq(cib_action, CIB_OP_ISMASTER) && rc != pcmk_ok) { crm_info("CIB on %s is _not_ the master instance", host ? host : "localhost"); fprintf(stderr, "CIB on %s is _not_ the master instance\n", host ? host : "localhost"); } else if (safe_str_eq(cib_action, CIB_OP_ISMASTER)) { crm_info("CIB on %s _is_ the master instance", host ? host : "localhost"); fprintf(stderr, "CIB on %s _is_ the master instance\n", host ? host : "localhost"); } else if (rc != 0) { crm_warn("Call %s failed (%d): %s", cib_action, rc, pcmk_strerror(rc)); fprintf(stderr, "Call %s failed (%d): %s\n", cib_action, rc, pcmk_strerror(rc)); fprintf(stdout, "%s\n", crm_str(admin_input_xml)); } else if (safe_str_eq(cib_action, CIB_OP_QUERY) && output == NULL) { crm_err("Output expected in query response"); crm_log_xml_err(msg, "no output"); } else if (output == NULL) { crm_info("Call passed"); } else { crm_info("Call passed"); fprintf(stdout, "%s\n", crm_str(admin_input_xml)); } free(admin_input_xml); if (call_id == request_id) { g_main_quit(mainloop); } else { crm_info("Message was not the response we were looking for (%d vs. %d)", call_id, request_id); } }
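The flow above is the stock GLib asynchronous request pattern: do_work() returns a positive call id, main() registers cibadmin_op_callback() against that id, and a GMainLoop runs until the callback sees the matching id and quits it. A minimal self-contained sketch of the same shape, using plain GLib only (fake_reply and the hard-coded id are hypothetical stand-ins, not part of the cib API):

    /* Sketch: wait for an "async reply" by running a main loop until the
     * callback for the expected call id fires (plain GLib, no Pacemaker). */
    #include <glib.h>
    #include <stdio.h>

    static GMainLoop *loop = NULL;
    static int pending_id = 0;

    static gboolean fake_reply(gpointer data)
    {
        int call_id = GPOINTER_TO_INT(data);

        if (call_id == pending_id) {
            /* the reply we were waiting for: stop the loop, as
             * cibadmin_op_callback() does with g_main_quit() */
            printf("got reply for call %d\n", call_id);
            g_main_loop_quit(loop);
        }
        return FALSE; /* one-shot source */
    }

    int main(void)
    {
        pending_id = 42; /* stands in for the id returned by do_work() */

        /* stands in for the_cib->cmds->register_callback(): arrange for
         * the "reply" to arrive later, then block in the main loop */
        g_timeout_add(100, fake_reply, GINT_TO_POINTER(pending_id));

        loop = g_main_loop_new(NULL, FALSE);
        g_main_loop_run(loop);
        g_main_loop_unref(loop);
        return 0;
    }

The deprecated g_main_new()/g_main_run() spellings used in the tool are thin wrappers around g_main_loop_new()/g_main_loop_run().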
host : "localhost"); } else if (rc != 0) { crm_warn("Call %s failed (%d): %s", cib_action, rc, pcmk_strerror(rc)); fprintf(stderr, "Call %s failed (%d): %s\n", cib_action, rc, pcmk_strerror(rc)); fprintf(stdout, "%s\n", crm_str(admin_input_xml)); } else if (safe_str_eq(cib_action, CIB_OP_QUERY) && output == NULL) { crm_err("Output expected in query response"); crm_log_xml_err(msg, "no output"); } else if (output == NULL) { crm_info("Call passed"); } else { crm_info("Call passed"); fprintf(stdout, "%s\n", crm_str(admin_input_xml)); } free(admin_input_xml); if (call_id == request_id) { g_main_quit(mainloop); } else { crm_info("Message was not the response we were looking for (%d vs. %d", call_id, request_id); } } diff --git a/tools/crm_mon.c b/tools/crm_mon.c index 7bcc187a70..542b0baaa5 100644 --- a/tools/crm_mon.c +++ b/tools/crm_mon.c @@ -1,2287 +1,2290 @@ /* * Copyright (C) 2004 Andrew Beekhof * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public * License as published by the Free Software Foundation; either * version 2 of the License, or (at your option) any later version. * * This software is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * General Public License for more details. * * You should have received a copy of the GNU General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include <../lib/pengine/unpack.h> #include <../pengine/pengine.h> #include /* GMainLoop *mainloop = NULL; */ void wait_for_refresh(int offset, const char *prefix, int msec); void clean_up(int rc); void crm_diff_update(const char *event, xmlNode * msg); gboolean mon_refresh_display(gpointer user_data); int cib_connect(gboolean full); void mon_st_callback(stonith_t *st, stonith_event_t *e); char *xml_file = NULL; char *as_html_file = NULL; int as_xml = 0; char *pid_file = NULL; char *snmp_target = NULL; char *snmp_community = NULL; gboolean as_console = TRUE;; gboolean simple_status = FALSE; gboolean group_by_node = FALSE; gboolean inactive_resources = FALSE; gboolean web_cgi = FALSE; int reconnect_msec = 5000; gboolean daemonize = FALSE; GMainLoop *mainloop = NULL; guint timer_id = 0; GList *attr_list = NULL; const char *crm_mail_host = NULL; const char *crm_mail_prefix = NULL; const char *crm_mail_from = NULL; const char *crm_mail_to = NULL; const char *external_agent = NULL; const char *external_recipient = NULL; cib_t *cib = NULL; stonith_t *st = NULL; xmlNode *current_cib = NULL; gboolean one_shot = FALSE; gboolean has_warnings = FALSE; gboolean print_failcount = FALSE; gboolean print_operations = FALSE; gboolean print_timing = FALSE; gboolean print_nodes_attr = FALSE; gboolean print_last_updated = TRUE; gboolean print_last_change = TRUE; gboolean print_tickets = FALSE; gboolean watch_fencing = FALSE; #define FILTER_STR {"shutdown", "terminate", "standby", "fail-count", \ "last-failure", "probe_complete", "#id", "#uname", \ "#is_dc", NULL} gboolean log_diffs = FALSE; gboolean log_updates = FALSE; long last_refresh = 0; crm_trigger_t *refresh_trigger = NULL; /* * 1.3.6.1.4.1.32723 has been assigned to the project by 
IANA * http://www.iana.org/assignments/enterprise-numbers */ #define PACEMAKER_PREFIX "1.3.6.1.4.1.32723" #define PACEMAKER_TRAP_PREFIX PACEMAKER_PREFIX ".1" #define snmp_crm_trap_oid PACEMAKER_TRAP_PREFIX #define snmp_crm_oid_node PACEMAKER_TRAP_PREFIX ".1" #define snmp_crm_oid_rsc PACEMAKER_TRAP_PREFIX ".2" #define snmp_crm_oid_task PACEMAKER_TRAP_PREFIX ".3" #define snmp_crm_oid_desc PACEMAKER_TRAP_PREFIX ".4" #define snmp_crm_oid_status PACEMAKER_TRAP_PREFIX ".5" #define snmp_crm_oid_rc PACEMAKER_TRAP_PREFIX ".6" #define snmp_crm_oid_trc PACEMAKER_TRAP_PREFIX ".7" #if CURSES_ENABLED # define print_dot() if(as_console) { \ printw("."); \ clrtoeol(); \ refresh(); \ } else { \ fprintf(stdout, "."); \ } #else # define print_dot() fprintf(stdout, "."); #endif #if CURSES_ENABLED # define print_as(fmt, args...) if(as_console) { \ printw(fmt, ##args); \ clrtoeol(); \ refresh(); \ } else { \ fprintf(stdout, fmt, ##args); \ } #else # define print_as(fmt, args...) fprintf(stdout, fmt, ##args); #endif static void blank_screen(void) { #if CURSES_ENABLED int lpc = 0; for (lpc = 0; lpc < LINES; lpc++) { move(lpc, 0); clrtoeol(); } move(0, 0); refresh(); #endif } static gboolean mon_timer_popped(gpointer data) { int rc = pcmk_ok; if (timer_id > 0) { g_source_remove(timer_id); } rc = cib_connect(TRUE); if (rc != pcmk_ok) { print_dot(); timer_id = g_timeout_add(reconnect_msec, mon_timer_popped, NULL); } return FALSE; } static void mon_cib_connection_destroy(gpointer user_data) { print_as("Connection to the CIB terminated\n"); if (cib) { print_as("Reconnecting..."); cib->cmds->signoff(cib); timer_id = g_timeout_add(reconnect_msec, mon_timer_popped, NULL); } return; } /* * Mainloop signal handler. */ static void mon_shutdown(int nsig) { clean_up(EX_OK); } #if ON_DARWIN # define sighandler_t sig_t #endif #if CURSES_ENABLED +#ifndef HAVE_SIGHANDLER_T +typedef void (*sighandler_t)(int); +#endif static sighandler_t ncurses_winch_handler; static void mon_winresize(int nsig) { static int not_done; int lines = 0, cols = 0; if (!not_done++) { if (ncurses_winch_handler) /* the original ncurses WINCH signal handler does the * magic of retrieving the new window size; * otherwise, we'd have to use ioctl or tgetent */ (*ncurses_winch_handler) (SIGWINCH); getmaxyx(stdscr, lines, cols); resizeterm(lines, cols); mainloop_set_trigger(refresh_trigger); } not_done--; } #endif int cib_connect(gboolean full) { int rc = pcmk_ok; static gboolean need_pass = TRUE; CRM_CHECK(cib != NULL, return -EINVAL); if (getenv("CIB_passwd") != NULL) { need_pass = FALSE; } if(watch_fencing && st == NULL) { st = stonith_api_new(); } if(watch_fencing && st->state == stonith_disconnected) { crm_trace("Connecting to stonith"); rc = st->cmds->connect(st, crm_system_name, NULL); if(rc == pcmk_ok) { crm_trace("Setting up stonith callbacks"); st->cmds->register_notification(st, T_STONITH_NOTIFY_FENCE, mon_st_callback); } } if (cib->state != cib_connected_query && cib->state != cib_connected_command) { crm_trace("Connecting to the CIB"); if (as_console && need_pass && cib->variant == cib_remote) { need_pass = FALSE; print_as("Password:"); } rc = cib->cmds->signon(cib, crm_system_name, cib_query); if (rc != pcmk_ok) { return rc; } current_cib = get_cib_copy(cib); mon_refresh_display(NULL); if (full) { if (rc == pcmk_ok) { rc = cib->cmds->set_connection_dnotify(cib, mon_cib_connection_destroy); if (rc == -EPROTONOSUPPORT) { print_as("Notification setup failed, won't be able to reconnect after failure"); if (as_console) { sleep(2); } rc = pcmk_ok; } } if 
(rc == pcmk_ok) { cib->cmds->del_notify_callback(cib, T_CIB_DIFF_NOTIFY, crm_diff_update); rc = cib->cmds->add_notify_callback(cib, T_CIB_DIFF_NOTIFY, crm_diff_update); } if (rc != pcmk_ok) { print_as("Notification setup failed, could not monitor CIB actions"); if (as_console) { sleep(2); } clean_up(-rc); } } } return rc; } /* *INDENT-OFF* */ static struct crm_option long_options[] = { /* Top-level Options */ {"help", 0, 0, '?', "\tThis text"}, {"version", 0, 0, '$', "\tVersion information" }, {"verbose", 0, 0, 'V', "\tIncrease debug output"}, {"quiet", 0, 0, 'Q', "\tDisplay only essential output" }, {"-spacer-", 1, 0, '-', "\nModes:"}, {"as-html", 1, 0, 'h', "Write cluster status to the named html file"}, {"as-xml", 0, 0, 'X', "\tWrite cluster status as xml to stdout. This will enable one-shot mode."}, {"web-cgi", 0, 0, 'w', "\tWeb mode with output suitable for cgi"}, {"simple-status", 0, 0, 's', "Display the cluster status once as a simple one-line output (suitable for nagios)"}, {"snmp-traps", 1, 0, 'S', "Send SNMP traps to this station", !ENABLE_SNMP}, {"snmp-community", 1, 0, 'C', "Specify community for SNMP traps (default is NULL)", !ENABLE_SNMP}, {"mail-to", 1, 0, 'T', "Send Mail alerts to this user. See also --mail-from, --mail-host, --mail-prefix", !ENABLE_ESMTP}, {"-spacer-", 1, 0, '-', "\nDisplay Options:"}, {"group-by-node", 0, 0, 'n', "\tGroup resources by node" }, {"inactive", 0, 0, 'r', "\tDisplay inactive resources" }, {"failcounts", 0, 0, 'f', "\tDisplay resource fail counts"}, {"operations", 0, 0, 'o', "\tDisplay resource operation history" }, {"timing-details", 0, 0, 't', "\tDisplay resource operation history with timing details" }, {"tickets", 0, 0, 'c', "\t\tDisplay cluster tickets"}, {"watch-fencing", 0, 0, 'W', "\t\tListen for fencing events.
For use with --external-agent, --mail-to and/or --snmp-traps where supported"}, {"show-node-attributes", 0, 0, 'A', "Display node attributes" }, {"-spacer-", 1, 0, '-', "\nAdditional Options:"}, {"interval", 1, 0, 'i', "\tUpdate frequency in seconds" }, {"one-shot", 0, 0, '1', "\tDisplay the cluster status once on the console and exit"}, {"disable-ncurses",0, 0, 'N', "\tDisable the use of ncurses", !CURSES_ENABLED}, {"daemonize", 0, 0, 'd', "\tRun in the background as a daemon"}, {"pid-file", 1, 0, 'p', "\t(Advanced) Daemon pid file location"}, {"mail-from", 1, 0, 'F', "\tMail alerts should come from the named user", !ENABLE_ESMTP}, {"mail-host", 1, 0, 'H', "\tMail alerts should be sent via the named host", !ENABLE_ESMTP}, {"mail-prefix", 1, 0, 'P', "Subjects for mail alerts should start with this string", !ENABLE_ESMTP}, {"external-agent", 1, 0, 'E', "A program to run when resource operations take place."}, {"external-recipient",1, 0, 'e', "A recipient for your program (assuming you want the program to send something to someone)."}, {"xml-file", 1, 0, 'x', NULL, 1}, {"-spacer-", 1, 0, '-', "\nExamples:", pcmk_option_paragraph}, {"-spacer-", 1, 0, '-', "Display the cluster status on the console with updates as they occur:", pcmk_option_paragraph}, {"-spacer-", 1, 0, '-', " crm_mon", pcmk_option_example}, {"-spacer-", 1, 0, '-', "Display the cluster status on the console just once then exit:", pcmk_option_paragraph}, {"-spacer-", 1, 0, '-', " crm_mon -1", pcmk_option_example}, {"-spacer-", 1, 0, '-', "Display your cluster status, group resources by node, and include inactive resources in the list:", pcmk_option_paragraph}, {"-spacer-", 1, 0, '-', " crm_mon --group-by-node --inactive", pcmk_option_example}, {"-spacer-", 1, 0, '-', "Start crm_mon as a background daemon and have it write the cluster status to an HTML file:", pcmk_option_paragraph}, {"-spacer-", 1, 0, '-', " crm_mon --daemonize --as-html /path/to/docroot/filename.html", pcmk_option_example}, {"-spacer-", 1, 0, '-', "Start crm_mon and export the current cluster status as xml to stdout, then exit:", pcmk_option_paragraph}, {"-spacer-", 1, 0, '-', " crm_mon --as-xml", pcmk_option_example}, {"-spacer-", 1, 0, '-', "Start crm_mon as a background daemon and have it send email alerts:", pcmk_option_paragraph|!ENABLE_ESMTP}, {"-spacer-", 1, 0, '-', " crm_mon --daemonize --mail-to user@example.com --mail-host mail.example.com", pcmk_option_example|!ENABLE_ESMTP}, {"-spacer-", 1, 0, '-', "Start crm_mon as a background daemon and have it send SNMP alerts:", pcmk_option_paragraph|!ENABLE_SNMP}, {"-spacer-", 1, 0, '-', " crm_mon --daemonize --snmp-traps snmptrapd.example.com", pcmk_option_example|!ENABLE_SNMP}, {NULL, 0, 0, 0} }; /* *INDENT-ON* */ int main(int argc, char **argv) { int flag; int argerr = 0; int exit_code = 0; int option_index = 0; pid_file = strdup("/tmp/ClusterMon.pid"); crm_log_cli_init("crm_mon"); crm_set_options(NULL, "mode [options]", long_options, "Provides a summary of the cluster's current state."
"\n\nOutputs varying levels of detail in a number of different formats.\n"); #ifndef ON_DARWIN /* prevent zombies */ signal(SIGCLD, SIG_IGN); #endif if (strcmp(crm_system_name, "crm_mon.cgi") == 0) { web_cgi = TRUE; one_shot = TRUE; } while (1) { flag = crm_get_option(argc, argv, &option_index); if (flag == -1) break; switch (flag) { case 'V': crm_bump_log_level(argc, argv); break; case 'Q': print_last_updated = FALSE; print_last_change = FALSE; break; case 'i': reconnect_msec = crm_get_msec(optarg); break; case 'n': group_by_node = TRUE; break; case 'r': inactive_resources = TRUE; break; case 'W': watch_fencing = TRUE; break; case 'd': daemonize = TRUE; break; case 't': print_timing = TRUE; print_operations = TRUE; break; case 'o': print_operations = TRUE; break; case 'f': print_failcount = TRUE; break; case 'A': print_nodes_attr = TRUE; break; case 'c': print_tickets = TRUE; break; case 'p': free(pid_file); pid_file = strdup(optarg); break; case 'x': xml_file = strdup(optarg); one_shot = TRUE; break; case 'h': as_html_file = strdup(optarg); break; case 'X': as_xml = TRUE; one_shot = TRUE; break; case 'w': web_cgi = TRUE; one_shot = TRUE; break; case 's': simple_status = TRUE; one_shot = TRUE; break; case 'S': snmp_target = optarg; break; case 'T': crm_mail_to = optarg; break; case 'F': crm_mail_from = optarg; break; case 'H': crm_mail_host = optarg; break; case 'P': crm_mail_prefix = optarg; break; case 'E': external_agent = optarg; break; case 'e': external_recipient = optarg; break; case '1': one_shot = TRUE; break; case 'N': as_console = FALSE; break; case 'C': snmp_community = optarg; break; case '$': case '?': crm_help(flag, EX_OK); break; default: printf("Argument code 0%o (%c) is not (?yet?) supported\n", flag, flag); ++argerr; break; } } if (optind < argc) { printf("non-option ARGV-elements: "); while (optind < argc) printf("%s ", argv[optind++]); printf("\n"); } if (argerr) { crm_help('?', EX_USAGE); } if (one_shot) { as_console = FALSE; } else if (daemonize) { as_console = FALSE; crm_enable_stderr(FALSE); if (!as_html_file && !snmp_target && !crm_mail_to && !external_agent && !as_xml) { printf ("Looks like you forgot to specify one or more of: --as-html, --as-xml, --mail-to, --snmp-target, --external-agent\n"); crm_help('?', EX_USAGE); } crm_make_daemon(crm_system_name, TRUE, pid_file); } else if (as_console) { #if CURSES_ENABLED initscr(); cbreak(); noecho(); crm_enable_stderr(FALSE); #else one_shot = TRUE; as_console = FALSE; printf("Defaulting to one-shot mode\n"); printf("You need to have curses available at compile time to enable console mode\n"); #endif } crm_info("Starting %s", crm_system_name); if (xml_file != NULL) { current_cib = filename2xml(xml_file); mon_refresh_display(NULL); return exit_code; } if (current_cib == NULL) { cib = cib_new(); if (!one_shot) { print_as("Attempting connection to the cluster..."); } do { exit_code = cib_connect(!one_shot); if (one_shot) { break; } else if (exit_code != pcmk_ok) { print_dot(); sleep(reconnect_msec / 1000); } } while (exit_code == -ENOTCONN); if (exit_code != pcmk_ok) { print_as("\nConnection to cluster failed: %s\n", pcmk_strerror(exit_code)); if (as_console) { sleep(2); } clean_up(-exit_code); } } if (one_shot) { return exit_code; } mainloop = g_main_new(FALSE); mainloop_add_signal(SIGTERM, mon_shutdown); mainloop_add_signal(SIGINT, mon_shutdown); #if CURSES_ENABLED if (as_console) { ncurses_winch_handler = signal(SIGWINCH, mon_winresize); if (ncurses_winch_handler == SIG_DFL || ncurses_winch_handler == SIG_IGN || 
ncurses_winch_handler == SIG_ERR) ncurses_winch_handler = NULL; } #endif refresh_trigger = mainloop_add_trigger(G_PRIORITY_LOW, mon_refresh_display, NULL); g_main_run(mainloop); g_main_destroy(mainloop); crm_info("Exiting %s", crm_system_name); clean_up(0); return 0; /* never reached */ } void wait_for_refresh(int offset, const char *prefix, int msec) { int lpc = msec / 1000; struct timespec sleept = { 1, 0 }; if (as_console == FALSE) { timer_id = g_timeout_add(msec, mon_timer_popped, NULL); return; } crm_notice("%sRefresh in %ds...", prefix ? prefix : "", lpc); while (lpc > 0) { #if CURSES_ENABLED move(offset, 0); /* printw("%sRefresh in \033[01;32m%ds\033[00m...", prefix?prefix:"", lpc); */ printw("%sRefresh in %ds...\n", prefix ? prefix : "", lpc); clrtoeol(); refresh(); #endif lpc--; if (lpc == 0) { timer_id = g_timeout_add(1000, mon_timer_popped, NULL); } else { if (nanosleep(&sleept, NULL) != 0) { return; } } } } #define mon_warn(fmt...) do { \ if (!has_warnings) { \ print_as("Warning:"); \ } else { \ print_as(","); \ } \ print_as(fmt); \ has_warnings = TRUE; \ } while(0) static int count_resources(pe_working_set_t * data_set, resource_t * rsc) { int count = 0; GListPtr gIter = NULL; if (rsc == NULL) { gIter = data_set->resources; } else if (rsc->children) { gIter = rsc->children; } else { return is_not_set(rsc->flags, pe_rsc_orphan); } for (; gIter != NULL; gIter = gIter->next) { count += count_resources(data_set, gIter->data); } return count; } static int print_simple_status(pe_working_set_t * data_set) { node_t *dc = NULL; GListPtr gIter = NULL; int nodes_online = 0; int nodes_standby = 0; dc = data_set->dc_node; if (dc == NULL) { mon_warn("No DC "); } for (gIter = data_set->nodes; gIter != NULL; gIter = gIter->next) { node_t *node = (node_t *) gIter->data; if (node->details->standby && node->details->online) { nodes_standby++; } else if (node->details->online) { nodes_online++; } else { mon_warn("offline node: %s", node->details->uname); } } if (!has_warnings) { print_as("Ok: %d nodes online", nodes_online); if (nodes_standby > 0) { print_as(", %d standby nodes", nodes_standby); } print_as(", %d resources configured", count_resources(data_set, NULL)); } print_as("\n"); return 0; } extern int get_failcount(node_t * node, resource_t * rsc, int *last_failure, pe_working_set_t * data_set); static void print_date(time_t time) { int lpc = 0; char date_str[26]; asctime_r(localtime(&time), date_str); for (; lpc < 26; lpc++) { if (date_str[lpc] == '\n') { date_str[lpc] = 0; } } print_as("'%s'", date_str); } static void print_rsc_summary(pe_working_set_t * data_set, node_t * node, resource_t * rsc, gboolean all) { gboolean printed = FALSE; time_t last_failure = 0; char *fail_attr = crm_concat("fail-count", rsc->id, '-'); const char *value = g_hash_table_lookup(node->details->attrs, fail_attr); int failcount = char2score(value); /* Get the true value, not the effective one from get_failcount() */ get_failcount(node, rsc, (int *)&last_failure, data_set); free(fail_attr); if (all || failcount || last_failure > 0) { printed = TRUE; print_as(" %s: migration-threshold=%d", rsc->id, rsc->migration_threshold); } if (failcount > 0) { printed = TRUE; print_as(" fail-count=%d", failcount); } if (last_failure > 0) { printed = TRUE; print_as(" last-failure="); print_date(last_failure); } if (printed) { print_as("\n"); } } static void print_rsc_history(pe_working_set_t * data_set, node_t * node, xmlNode * rsc_entry) { GListPtr gIter = NULL; GListPtr op_list = NULL; gboolean print_name = TRUE; GListPtr 
sorted_op_list = NULL; const char *rsc_id = crm_element_value(rsc_entry, XML_ATTR_ID); resource_t *rsc = pe_find_resource(data_set->resources, rsc_id); xmlNode *rsc_op = NULL; for (rsc_op = __xml_first_child(rsc_entry); rsc_op != NULL; rsc_op = __xml_next(rsc_op)) { if (crm_str_eq((const char *)rsc_op->name, XML_LRM_TAG_RSC_OP, TRUE)) { op_list = g_list_append(op_list, rsc_op); } } sorted_op_list = g_list_sort(op_list, sort_op_by_callid); for (gIter = sorted_op_list; gIter != NULL; gIter = gIter->next) { xmlNode *xml_op = (xmlNode *) gIter->data; const char *value = NULL; const char *call = crm_element_value(xml_op, XML_LRM_ATTR_CALLID); const char *task = crm_element_value(xml_op, XML_LRM_ATTR_TASK); const char *op_rc = crm_element_value(xml_op, XML_LRM_ATTR_RC); const char *interval = crm_element_value(xml_op, XML_LRM_ATTR_INTERVAL); int rc = crm_parse_int(op_rc, "0"); if (safe_str_eq(task, CRMD_ACTION_STATUS) && safe_str_eq(interval, "0")) { task = "probe"; } if (rc == 7 && safe_str_eq(task, "probe")) { continue; } else if (safe_str_eq(task, CRMD_ACTION_NOTIFY)) { continue; } if (print_name) { print_name = FALSE; if (rsc == NULL) { print_as("Orphan resource: %s", rsc_id); } else { print_rsc_summary(data_set, node, rsc, TRUE); } } print_as(" + (%s) %s:", call, task); if (safe_str_neq(interval, "0")) { print_as(" interval=%sms", interval); } if (print_timing) { int int_value; const char *attr = "last-rc-change"; value = crm_element_value(xml_op, attr); if (value) { int_value = crm_parse_int(value, NULL); print_as(" %s=", attr); print_date(int_value); } attr = "last-run"; value = crm_element_value(xml_op, attr); if (value) { int_value = crm_parse_int(value, NULL); print_as(" %s=", attr); print_date(int_value); } attr = "exec-time"; value = crm_element_value(xml_op, attr); if (value) { int_value = crm_parse_int(value, NULL); print_as(" %s=%dms", attr, int_value); } attr = "queue-time"; value = crm_element_value(xml_op, attr); if (value) { int_value = crm_parse_int(value, NULL); print_as(" %s=%dms", attr, int_value); } } print_as(" rc=%s (%s)\n", op_rc, lrmd_event_rc2str(rc)); } /* no need to free the contents */ g_list_free(sorted_op_list); } static void print_attr_msg(node_t * node, GListPtr rsc_list, const char *attrname, const char *attrvalue) { GListPtr gIter = NULL; for (gIter = rsc_list; gIter != NULL; gIter = gIter->next) { resource_t *rsc = (resource_t *) gIter->data; const char *type = g_hash_table_lookup(rsc->meta, "type"); if (rsc->children != NULL) { print_attr_msg(node, rsc->children, attrname, attrvalue); } if (safe_str_eq(type, "ping") || safe_str_eq(type, "pingd")) { const char *name = "pingd"; const char *multiplier = NULL; char **host_list = NULL; int host_list_num = 0; int expected_score = 0; if (g_hash_table_lookup(rsc->meta, "name") != NULL) { name = g_hash_table_lookup(rsc->meta, "name"); } /* To identify the resource with the attribute name. */ if (safe_str_eq(name, attrname)) { int value = crm_parse_int(attrvalue, "0"); multiplier = g_hash_table_lookup(rsc->meta, "multiplier"); host_list = g_strsplit(g_hash_table_lookup(rsc->meta, "host_list"), " ", 0); host_list_num = g_strv_length(host_list); g_strfreev(host_list); /* pingd multiplier is the same as the default value. */ expected_score = host_list_num * crm_parse_int(multiplier, "1"); /* pingd is abnormal score. 
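* (Reading the checks below: a reported score of zero or less is treated as total loss of connectivity, while anything under host_list_num * multiplier means at least one ping target is unreachable.)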
*/ if (value <= 0) { print_as("\t: Connectivity is lost"); } else if (value < expected_score) { print_as("\t: Connectivity is degraded (Expected=%d)", expected_score); } } } } } static int compare_attribute(gconstpointer a, gconstpointer b) { int rc; rc = strcmp((const char *)a, (const char *)b); return rc; } static void create_attr_list(gpointer name, gpointer value, gpointer data) { int i; const char *filt_str[] = FILTER_STR; CRM_CHECK(name != NULL, return); /* filtering automatic attributes */ for (i = 0; filt_str[i] != NULL; i++) { if (g_str_has_prefix(name, filt_str[i])) { return; } } attr_list = g_list_insert_sorted(attr_list, name, compare_attribute); } static void print_node_attribute(gpointer name, gpointer node_data) { const char *value = NULL; node_t *node = (node_t *) node_data; value = g_hash_table_lookup(node->details->attrs, name); print_as(" + %-32s\t: %-10s", (char *)name, value); print_attr_msg(node, node->details->running_rsc, name, value); print_as("\n"); } static void print_node_summary(pe_working_set_t * data_set, gboolean operations) { xmlNode *lrm_rsc = NULL; xmlNode *rsc_entry = NULL; xmlNode *node_state = NULL; xmlNode *cib_status = get_object_root(XML_CIB_TAG_STATUS, data_set->input); if (operations) { print_as("\nOperations:\n"); } else { print_as("\nMigration summary:\n"); } for (node_state = __xml_first_child(cib_status); node_state != NULL; node_state = __xml_next(node_state)) { if (crm_str_eq((const char *)node_state->name, XML_CIB_TAG_STATE, TRUE)) { node_t *node = pe_find_node_id(data_set->nodes, ID(node_state)); if (node == NULL || node->details->online == FALSE) { continue; } print_as("* Node %s: ", crm_element_value(node_state, XML_ATTR_UNAME)); print_as("\n"); lrm_rsc = find_xml_node(node_state, XML_CIB_TAG_LRM, FALSE); lrm_rsc = find_xml_node(lrm_rsc, XML_LRM_TAG_RESOURCES, FALSE); for (rsc_entry = __xml_first_child(lrm_rsc); rsc_entry != NULL; rsc_entry = __xml_next(rsc_entry)) { if (crm_str_eq((const char *)rsc_entry->name, XML_LRM_TAG_RESOURCE, TRUE)) { if (operations) { print_rsc_history(data_set, node, rsc_entry); } else { const char *rsc_id = crm_element_value(rsc_entry, XML_ATTR_ID); resource_t *rsc = pe_find_resource(data_set->resources, rsc_id); if (rsc) { print_rsc_summary(data_set, node, rsc, FALSE); } else { print_as(" %s: orphan\n", rsc_id); } } } } } } } static void print_ticket(gpointer name, gpointer value, gpointer data) { ticket_t *ticket = (ticket_t *) value; print_as(" %s\t%s%10s", ticket->id, ticket->granted ? "granted":"revoked", ticket->standby ? " [standby]":""); if (ticket->last_granted > -1) { print_as(" last-granted="); print_date(ticket->last_granted); } print_as("\n"); return; } static void print_cluster_tickets(pe_working_set_t * data_set) { xmlNode *cib_constraints = get_object_root(XML_CIB_TAG_CONSTRAINTS, data_set->input); /* For recording the tickets that are referenced in rsc_ticket constraints * but have never been granted yet. 
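* (Assumption, judging from the call below: unpack_constraints() populates data_set->tickets as a side effect, so tickets that only appear in rsc_ticket constraints are still listed, showing as revoked.)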
*/ unpack_constraints(cib_constraints, data_set); print_as("\nTickets:\n"); g_hash_table_foreach(data_set->tickets, print_ticket, NULL); return; } static int print_status(pe_working_set_t * data_set) { static int updates = 0; GListPtr gIter = NULL; node_t *dc = NULL; char *since_epoch = NULL; char *online_nodes = NULL; char *offline_nodes = NULL; xmlNode *dc_version = NULL; xmlNode *quorum_node = NULL; xmlNode *stack = NULL; time_t a_time = time(NULL); int print_opts = pe_print_ncurses; const char *quorum_votes = "unknown"; if (as_console) { blank_screen(); } else { print_opts = pe_print_printf; } updates++; dc = data_set->dc_node; if (a_time == (time_t) - 1) { crm_perror(LOG_ERR, "set_node_tstamp(): Invalid time returned"); return 1; } since_epoch = ctime(&a_time); if (since_epoch != NULL && print_last_updated) { print_as("Last updated: %s", since_epoch); } if (print_last_change) { const char *last_written = crm_element_value(data_set->input, XML_CIB_ATTR_WRITTEN); const char *user = crm_element_value(data_set->input, XML_ATTR_UPDATE_USER); const char *client = crm_element_value(data_set->input, XML_ATTR_UPDATE_CLIENT); const char *origin = crm_element_value(data_set->input, XML_ATTR_UPDATE_ORIG); print_as("Last change: %s", last_written ? last_written : ""); if (user) { print_as(" by %s", user); } if (client) { print_as(" via %s", client); } if (origin) { print_as(" on %s", origin); } print_as("\n"); } stack = get_xpath_object("//nvpair[@name='cluster-infrastructure']", data_set->input, LOG_DEBUG); if (stack) { print_as("Stack: %s\n", crm_element_value(stack, XML_NVPAIR_ATTR_VALUE)); } dc_version = get_xpath_object("//nvpair[@name='dc-version']", data_set->input, LOG_DEBUG); if (dc == NULL) { print_as("Current DC: NONE\n"); } else { const char *quorum = crm_element_value(data_set->input, XML_ATTR_HAVE_QUORUM); if (safe_str_neq(dc->details->uname, dc->details->id)) { print_as("Current DC: %s (%s)", dc->details->uname, dc->details->id); } else { print_as("Current DC: %s", dc->details->uname); } print_as(" - partition %s quorum\n", crm_is_true(quorum) ? 
"with" : "WITHOUT"); if (dc_version) { print_as("Version: %s\n", crm_element_value(dc_version, XML_NVPAIR_ATTR_VALUE)); } } quorum_node = get_xpath_object("//nvpair[@name='" XML_ATTR_EXPECTED_VOTES "']", data_set->input, LOG_DEBUG); if (quorum_node) { quorum_votes = crm_element_value(quorum_node, XML_NVPAIR_ATTR_VALUE); } print_as("%d Nodes configured, %s expected votes\n", g_list_length(data_set->nodes), quorum_votes); print_as("%d Resources configured.\n", count_resources(data_set, NULL)); print_as("\n\n"); for (gIter = data_set->nodes; gIter != NULL; gIter = gIter->next) { node_t *node = (node_t *) gIter->data; const char *node_mode = NULL; if (node->details->unclean) { if (node->details->online && node->details->unclean) { node_mode = "UNCLEAN (online)"; } else if (node->details->pending) { node_mode = "UNCLEAN (pending)"; } else { node_mode = "UNCLEAN (offline)"; } } else if (node->details->pending) { node_mode = "pending"; } else if (node->details->standby_onfail && node->details->online) { node_mode = "standby (on-fail)"; } else if (node->details->standby) { if (node->details->online) { node_mode = "standby"; } else { node_mode = "OFFLINE (standby)"; } } else if (node->details->online) { node_mode = "online"; if (group_by_node == FALSE) { online_nodes = add_list_element(online_nodes, node->details->uname); continue; } } else { node_mode = "OFFLINE"; if (group_by_node == FALSE) { offline_nodes = add_list_element(offline_nodes, node->details->uname); continue; } } if (safe_str_eq(node->details->uname, node->details->id)) { print_as("Node %s: %s\n", node->details->uname, node_mode); } else { print_as("Node %s (%s): %s\n", node->details->uname, node->details->id, node_mode); } if (group_by_node) { GListPtr gIter2 = NULL; for (gIter2 = node->details->running_rsc; gIter2 != NULL; gIter2 = gIter2->next) { resource_t *rsc = (resource_t *) gIter2->data; rsc->fns->print(rsc, "\t", print_opts | pe_print_rsconly, stdout); } } } if (online_nodes) { print_as("Online: [%s ]\n", online_nodes); free(online_nodes); } if (offline_nodes) { print_as("OFFLINE: [%s ]\n", offline_nodes); free(offline_nodes); } if (group_by_node == FALSE && inactive_resources) { print_as("\nFull list of resources:\n"); } else if (inactive_resources) { print_as("\nInactive resources:\n"); } if (group_by_node == FALSE || inactive_resources) { print_as("\n"); for (gIter = data_set->resources; gIter != NULL; gIter = gIter->next) { resource_t *rsc = (resource_t *) gIter->data; gboolean is_active = rsc->fns->active(rsc, TRUE); gboolean partially_active = rsc->fns->active(rsc, FALSE); if (is_set(rsc->flags, pe_rsc_orphan) && is_active == FALSE) { continue; } else if (group_by_node == FALSE) { if (partially_active || inactive_resources) { rsc->fns->print(rsc, NULL, print_opts, stdout); } } else if (is_active == FALSE && inactive_resources) { rsc->fns->print(rsc, NULL, print_opts, stdout); } } } if (print_nodes_attr) { print_as("\nNode Attributes:\n"); for (gIter = data_set->nodes; gIter != NULL; gIter = gIter->next) { node_t *node = (node_t *) gIter->data; if (node == NULL || node->details->online == FALSE) { continue; } attr_list = NULL; print_as("* Node %s:\n", node->details->uname); g_hash_table_foreach(node->details->attrs, create_attr_list, NULL); g_list_foreach(attr_list, print_node_attribute, node); } } if (print_operations || print_failcount) { print_node_summary(data_set, print_operations); } if (xml_has_children(data_set->failed)) { xmlNode *xml_op = NULL; print_as("\nFailed actions:\n"); for (xml_op = 
__xml_first_child(data_set->failed); xml_op != NULL; xml_op = __xml_next(xml_op)) { int val = 0; const char *id = ID(xml_op); const char *op_key = crm_element_value(xml_op, XML_LRM_ATTR_TASK_KEY); const char *last = crm_element_value(xml_op, "last_run"); const char *node = crm_element_value(xml_op, XML_ATTR_UNAME); const char *call = crm_element_value(xml_op, XML_LRM_ATTR_CALLID); const char *rc = crm_element_value(xml_op, XML_LRM_ATTR_RC); const char *status = crm_element_value(xml_op, XML_LRM_ATTR_OPSTATUS); val = crm_parse_int(status, "0"); print_as(" %s (node=%s, call=%s, rc=%s, status=%s", op_key ? op_key : id, node, call, rc, services_lrm_status_str(val)); if (last) { time_t run_at = crm_parse_int(last, "0"); print_as(", last-run=%s, queued=%sms, exec=%sms\n", ctime(&run_at), crm_element_value(xml_op, "exec_time"), crm_element_value(xml_op, "queue_time")); } val = crm_parse_int(rc, "0"); print_as("): %s\n", lrmd_event_rc2str(val)); } } if (print_tickets) { print_cluster_tickets(data_set); } #if CURSES_ENABLED if (as_console) { refresh(); } #endif return 0; } static int print_xml_status(pe_working_set_t * data_set) { FILE *stream = stdout; GListPtr gIter = NULL; node_t *dc = NULL; xmlNode *stack = NULL; xmlNode *quorum_node = NULL; const char *quorum_votes = "unknown"; dc = data_set->dc_node; fprintf(stream, "\n"); fprintf(stream, "\n", VERSION); /*** SUMMARY ***/ fprintf(stream, " \n"); if (print_last_updated) { time_t now = time(NULL); char *now_str = ctime(&now); now_str[24] = EOS; /* replace the newline */ fprintf(stream, " \n", now_str); } if (print_last_change) { const char *last_written = crm_element_value(data_set->input, XML_CIB_ATTR_WRITTEN); const char *user = crm_element_value(data_set->input, XML_ATTR_UPDATE_USER); const char *client = crm_element_value(data_set->input, XML_ATTR_UPDATE_CLIENT); const char *origin = crm_element_value(data_set->input, XML_ATTR_UPDATE_ORIG); fprintf(stream, " \n", last_written ? last_written : "", user ? user : "", client ? client : "", origin ? origin : ""); } stack = get_xpath_object("//nvpair[@name='cluster-infrastructure']", data_set->input, LOG_DEBUG); if (stack) { fprintf(stream, " \n", crm_element_value(stack, XML_NVPAIR_ATTR_VALUE)); } if (!dc) { fprintf(stream, " \n"); } else { const char *quorum = crm_element_value(data_set->input, XML_ATTR_HAVE_QUORUM); const char *uname = dc->details->uname; const char *id = dc->details->id; xmlNode *dc_version = get_xpath_object("//nvpair[@name='dc-version']", data_set->input, LOG_DEBUG); fprintf(stream, " \n", dc_version ? crm_element_value(dc_version, XML_NVPAIR_ATTR_VALUE) : "", uname, id, quorum ? (crm_is_true(quorum) ? "true" : "false") : "false"); } quorum_node = get_xpath_object("//nvpair[@name='" XML_ATTR_EXPECTED_VOTES "']", data_set->input, LOG_DEBUG); if (quorum_node) { quorum_votes = crm_element_value(quorum_node, XML_NVPAIR_ATTR_VALUE); } fprintf(stream, " \n", g_list_length(data_set->nodes), quorum_votes); fprintf(stream, " \n", count_resources(data_set, NULL)); fprintf(stream, " \n"); /*** NODES ***/ fprintf(stream, " \n"); for (gIter = data_set->nodes; gIter != NULL; gIter = gIter->next) { node_t *node = (node_t *) gIter->data; const char *node_type = "unknown"; switch (node->details->type) { case node_member: node_type = "member"; break; case node_ping: node_type = "ping"; break; } fprintf(stream, " details->uname); fprintf(stream, "id=\"%s\" ", node->details->id); fprintf(stream, "online=\"%s\" ", node->details->online ? 
"true" : "false"); fprintf(stream, "standby=\"%s\" ", node->details->standby ? "true" : "false"); fprintf(stream, "standby_onfail=\"%s\" ", node->details->standby_onfail ? "true" : "false"); fprintf(stream, "pending=\"%s\" ", node->details->pending ? "true" : "false"); fprintf(stream, "unclean=\"%s\" ", node->details->unclean ? "true" : "false"); fprintf(stream, "shutdown=\"%s\" ", node->details->shutdown ? "true" : "false"); fprintf(stream, "expected_up=\"%s\" ", node->details->expected_up ? "true" : "false"); fprintf(stream, "is_dc=\"%s\" ", node->details->is_dc ? "true" : "false"); fprintf(stream, "resources_running=\"%d\" ", g_list_length(node->details->running_rsc)); fprintf(stream, "type=\"%s\" ", node_type); if (group_by_node) { GListPtr lpc2 = NULL; fprintf(stream, ">\n"); for (lpc2 = node->details->running_rsc; lpc2 != NULL; lpc2 = lpc2->next) { resource_t *rsc = (resource_t *) lpc2->data; rsc->fns->print(rsc, " ", pe_print_xml | pe_print_rsconly, stream); } fprintf(stream, " \n"); } else { fprintf(stream, "/>\n"); } } fprintf(stream, " \n"); /*** RESOURCES ***/ if (group_by_node == FALSE || inactive_resources) { fprintf(stream, " \n"); for (gIter = data_set->resources; gIter != NULL; gIter = gIter->next) { resource_t *rsc = (resource_t *) gIter->data; gboolean is_active = rsc->fns->active(rsc, TRUE); gboolean partially_active = rsc->fns->active(rsc, FALSE); if (is_set(rsc->flags, pe_rsc_orphan) && is_active == FALSE) { continue; } else if (group_by_node == FALSE) { if (partially_active || inactive_resources) { rsc->fns->print(rsc, " ", pe_print_xml, stream); } } else if (is_active == FALSE && inactive_resources) { rsc->fns->print(rsc, " ", pe_print_xml, stream); } } fprintf(stream, " \n"); } fprintf(stream, "\n"); fflush(stream); fclose(stream); return 0; } static int print_html_status(pe_working_set_t * data_set, const char *filename, gboolean web_cgi) { FILE *stream; GListPtr gIter = NULL; node_t *dc = NULL; static int updates = 0; char *filename_tmp = NULL; if (web_cgi) { stream = stdout; fprintf(stream, "Content-type: text/html\n\n"); } else { filename_tmp = crm_concat(filename, "tmp", '.'); stream = fopen(filename_tmp, "w"); if (stream == NULL) { crm_perror(LOG_ERR, "Cannot open %s for writing", filename_tmp); free(filename_tmp); return -1; } } updates++; dc = data_set->dc_node; fprintf(stream, ""); fprintf(stream, ""); fprintf(stream, "Cluster status"); /* content="%d;url=http://webdesign.about.com" */ fprintf(stream, "", reconnect_msec / 1000); fprintf(stream, ""); /*** SUMMARY ***/ fprintf(stream, "
static int
print_html_status(pe_working_set_t * data_set, const char *filename, gboolean web_cgi)
{
    FILE *stream;
    GListPtr gIter = NULL;
    node_t *dc = NULL;
    static int updates = 0;
    char *filename_tmp = NULL;

    if (web_cgi) {
        stream = stdout;
        fprintf(stream, "Content-type: text/html\n\n");

    } else {
        filename_tmp = crm_concat(filename, "tmp", '.');
        stream = fopen(filename_tmp, "w");
        if (stream == NULL) {
            crm_perror(LOG_ERR, "Cannot open %s for writing", filename_tmp);
            free(filename_tmp);
            return -1;
        }
    }

    updates++;
    dc = data_set->dc_node;

    fprintf(stream, "<html>");
    fprintf(stream, "<head>");
    fprintf(stream, "<title>Cluster status</title>");
    /* content="%d;url=http://webdesign.about.com" */
    fprintf(stream, "<meta http-equiv=\"refresh\" content=\"%d\">", reconnect_msec / 1000);
    fprintf(stream, "</head>");

    /*** SUMMARY ***/
    fprintf(stream, "<h2>Cluster summary</h2>");
    {
        char *now_str = NULL;
        time_t now = time(NULL);

        now_str = ctime(&now);
        now_str[24] = EOS;      /* replace the newline */
        fprintf(stream, "Last updated: %s<br/>\n", now_str);
    }

    if (dc == NULL) {
        fprintf(stream, "Current DC: NONE<br/>");
    } else {
        fprintf(stream, "Current DC: %s (%s)<br/>", dc->details->uname, dc->details->id);
    }
    fprintf(stream, "%d Nodes configured.<br/>", g_list_length(data_set->nodes));
    fprintf(stream, "%d Resources configured.<br/>", count_resources(data_set, NULL));

    /*** CONFIG ***/
    fprintf(stream, "<h3>Config Options</h3>\n");
    fprintf(stream, "<table>\n");
    fprintf(stream, "<tr><td>STONITH of failed nodes</td><td>:</td><td>%s</td></tr>\n",
            is_set(data_set->flags, pe_flag_stonith_enabled) ? "enabled" : "disabled");
    fprintf(stream, "<tr><td>Cluster is</td><td>:</td><td>%ssymmetric</td></tr>\n",
            is_set(data_set->flags, pe_flag_symmetric_cluster) ? "" : "a-");
    fprintf(stream, "<tr><td>No Quorum Policy</td><td>:</td><td>");
    switch (data_set->no_quorum_policy) {
        case no_quorum_freeze:
            fprintf(stream, "Freeze resources");
            break;
        case no_quorum_stop:
            fprintf(stream, "Stop ALL resources");
            break;
        case no_quorum_ignore:
            fprintf(stream, "Ignore");
            break;
        case no_quorum_suicide:
            fprintf(stream, "Suicide");
            break;
    }
    fprintf(stream, "</td></tr>\n</table>\n");

    /*** NODE LIST ***/
    fprintf(stream, "<h2>Node List</h2>\n");
    fprintf(stream, "<ul>\n");
    for (gIter = data_set->nodes; gIter != NULL; gIter = gIter->next) {
        node_t *node = (node_t *) gIter->data;

        fprintf(stream, "<li>");
        if (node->details->standby_onfail && node->details->online) {
            fprintf(stream, "Node: %s (%s): %s", node->details->uname, node->details->id,
                    "standby (on-fail)\n");
        } else if (node->details->standby && node->details->online) {
            fprintf(stream, "Node: %s (%s): %s", node->details->uname, node->details->id,
                    "standby\n");
        } else if (node->details->standby) {
            fprintf(stream, "Node: %s (%s): %s", node->details->uname, node->details->id,
                    "OFFLINE (standby)\n");
        } else if (node->details->online) {
            fprintf(stream, "Node: %s (%s): %s", node->details->uname, node->details->id,
                    "online\n");
        } else {
            fprintf(stream, "Node: %s (%s): %s", node->details->uname, node->details->id,
                    "OFFLINE\n");
        }
        if (group_by_node) {
            GListPtr lpc2 = NULL;

            fprintf(stream, "<ul>\n");
            for (lpc2 = node->details->running_rsc; lpc2 != NULL; lpc2 = lpc2->next) {
                resource_t *rsc = (resource_t *) lpc2->data;

                fprintf(stream, "<li>");
                rsc->fns->print(rsc, NULL, pe_print_html | pe_print_rsconly, stream);
                fprintf(stream, "</li>\n");
            }
            fprintf(stream, "</ul>\n");
        }
        fprintf(stream, "</li>\n");
    }
    fprintf(stream, "</ul>\n");

    if (group_by_node && inactive_resources) {
        fprintf(stream, "<h2>Inactive Resources</h2>\n");

    } else if (group_by_node == FALSE) {
        fprintf(stream, "<h2>Resource List</h2>\n");
    }

    if (group_by_node == FALSE || inactive_resources) {
        for (gIter = data_set->resources; gIter != NULL; gIter = gIter->next) {
            resource_t *rsc = (resource_t *) gIter->data;
            gboolean is_active = rsc->fns->active(rsc, TRUE);
            gboolean partially_active = rsc->fns->active(rsc, FALSE);

            if (is_set(rsc->flags, pe_rsc_orphan) && is_active == FALSE) {
                continue;

            } else if (group_by_node == FALSE) {
                if (partially_active || inactive_resources) {
                    rsc->fns->print(rsc, NULL, pe_print_html, stream);
                }

            } else if (is_active == FALSE && inactive_resources) {
                rsc->fns->print(rsc, NULL, pe_print_html, stream);
            }
        }
    }

    fprintf(stream, "</html>");
    fflush(stream);
    fclose(stream);

    if (!web_cgi) {
        if (rename(filename_tmp, filename) != 0) {
            crm_perror(LOG_ERR, "Unable to rename %s->%s", filename_tmp, filename);
        }
        free(filename_tmp);
    }

    return 0;
}

#if ENABLE_SNMP
#  include <net-snmp/net-snmp-config.h>
#  include <net-snmp/snmpv3_api.h>
#  include <net-snmp/agent/agent_trap.h>
#  include <net-snmp/library/snmp_client.h>
#  include <net-snmp/library/mib.h>
#  include <net-snmp/library/snmp_debug.h>

#  define add_snmp_field(list, oid_string, value) do {                     \
        oid name[MAX_OID_LEN];                                             \
        size_t name_length = MAX_OID_LEN;                                  \
        if (snmp_parse_oid(oid_string, name, &name_length)) {              \
            int s_rc = snmp_add_var(list, name, name_length, 's', (value)); \
            if(s_rc != 0) {                                                \
                crm_err("Could not add %s=%s rc=%d", oid_string, value, s_rc); \
            } else {                                                       \
                crm_trace("Added %s=%s", oid_string, value);               \
            }                                                              \
        } else {                                                           \
            crm_err("Could not parse OID: %s", oid_string);                \
        }                                                                  \
    } while(0)

#  define add_snmp_field_int(list, oid_string, value) do {                 \
        oid name[MAX_OID_LEN];                                             \
        size_t name_length = MAX_OID_LEN;                                  \
        if (snmp_parse_oid(oid_string, name, &name_length)) {              \
            if(NULL == snmp_pdu_add_variable(                              \
                   list, name, name_length, ASN_INTEGER,                   \
                   (u_char *) & value, sizeof(value))) {                   \
                crm_err("Could not add %s=%d", oid_string, value);         \
            } else {                                                       \
                crm_trace("Added %s=%d", oid_string, value);               \
            }                                                              \
        } else {                                                           \
            crm_err("Could not parse OID: %s", oid_string);                \
        }                                                                  \
    } while(0)
static int
snmp_input(int operation, netsnmp_session * session, int reqid, netsnmp_pdu * pdu, void *magic)
{
    return 1;
}

static netsnmp_session *
crm_snmp_init(const char *target, char *community)
{
    static netsnmp_session *session = NULL;

#  ifdef NETSNMPV53
    char target53[128];

    snprintf(target53, sizeof(target53), "%s:162", target);
#  endif

    if (session) {
        return session;
    }

    if (target == NULL) {
        return NULL;
    }

    if (get_crm_log_level() > LOG_INFO) {
        char *debug_tokens = strdup("run:shell,snmptrap,tdomain");

        debug_register_tokens(debug_tokens);
        snmp_set_do_debugging(1);
    }

    session = calloc(1, sizeof(netsnmp_session));
    snmp_sess_init(session);
    session->version = SNMP_VERSION_2c;
    session->callback = snmp_input;
    session->callback_magic = NULL;

    if (community) {
        session->community_len = strlen(community);
        session->community = (unsigned char *)community;
    }

    session = snmp_add(session,
#  ifdef NETSNMPV53
                       netsnmp_tdomain_transport(target53, 0, "udp"),
#  else
                       netsnmp_transport_open_client("snmptrap", target),
#  endif
                       NULL, NULL);

    if (session == NULL) {
        snmp_sess_perror("Could not create snmp transport", session);
    }
    return session;
}
#endif

static int
send_snmp_trap(const char *node, const char *rsc, const char *task, int target_rc, int rc,
               int status, const char *desc)
{
    int ret = 1;

#if ENABLE_SNMP
    static oid snmptrap_oid[] = { 1, 3, 6, 1, 6, 3, 1, 1, 4, 1, 0 };
    static oid sysuptime_oid[] = { 1, 3, 6, 1, 2, 1, 1, 3, 0 };

    netsnmp_pdu *trap_pdu;
    netsnmp_session *session = crm_snmp_init(snmp_target, snmp_community);

    trap_pdu = snmp_pdu_create(SNMP_MSG_TRAP2);
    if (!trap_pdu) {
        crm_err("Failed to create SNMP notification");
        return SNMPERR_GENERR;
    }

    if (1) {
        /* send uptime */
        char csysuptime[20];
        time_t now = time(NULL);

        sprintf(csysuptime, "%ld", now);
        snmp_add_var(trap_pdu, sysuptime_oid, sizeof(sysuptime_oid) / sizeof(oid), 't',
                     csysuptime);
    }

    /* Indicate what the trap is by setting snmpTrapOid.0 */
    ret = snmp_add_var(trap_pdu, snmptrap_oid, sizeof(snmptrap_oid) / sizeof(oid), 'o',
                       snmp_crm_trap_oid);
    if (ret != 0) {
        crm_err("Failed to set snmpTrapOid.0=%s", snmp_crm_trap_oid);
        return ret;
    }

    /* Add entries to the trap */
    add_snmp_field(trap_pdu, snmp_crm_oid_rsc, rsc);
    add_snmp_field(trap_pdu, snmp_crm_oid_node, node);
    add_snmp_field(trap_pdu, snmp_crm_oid_task, task);
    add_snmp_field(trap_pdu, snmp_crm_oid_desc, desc);

    add_snmp_field_int(trap_pdu, snmp_crm_oid_rc, rc);
    add_snmp_field_int(trap_pdu, snmp_crm_oid_trc, target_rc);
    add_snmp_field_int(trap_pdu, snmp_crm_oid_status, status);

    /* Send and cleanup */
    ret = snmp_send(session, trap_pdu);
    if (ret == 0) {
        /* error */
        snmp_sess_perror("Could not send SNMP trap", session);
        snmp_free_pdu(trap_pdu);
        ret = SNMPERR_GENERR;
    } else {
        ret = SNMPERR_SUCCESS;
    }
#else
    crm_err("Sending SNMP traps is not supported by this installation");
#endif
    return ret;
}

#if ENABLE_ESMTP
#  include <auth-client.h>
#  include <libesmtp.h>

static void
print_recipient_status(smtp_recipient_t recipient, const char *mailbox, void *arg)
{
    const smtp_status_t *status;

    status = smtp_recipient_status(recipient);
    printf("%s: %d %s", mailbox, status->code, status->text);
}

static void
event_cb(smtp_session_t session, int event_no, void *arg, ...)
{
    int *ok;
    va_list alist;

    va_start(alist, arg);
    switch (event_no) {
        case SMTP_EV_CONNECT:
        case SMTP_EV_MAILSTATUS:
        case SMTP_EV_RCPTSTATUS:
        case SMTP_EV_MESSAGEDATA:
        case SMTP_EV_MESSAGESENT:
        case SMTP_EV_DISCONNECT:
            break;

        case SMTP_EV_WEAK_CIPHER:{
                int bits = va_arg(alist, long);

                ok = va_arg(alist, int *);
                crm_debug("SMTP_EV_WEAK_CIPHER, bits=%d - accepted.", bits);
                *ok = 1;
                break;
            }
        case SMTP_EV_STARTTLS_OK:
            crm_debug("SMTP_EV_STARTTLS_OK - TLS started here.");
            break;
        case SMTP_EV_INVALID_PEER_CERTIFICATE:{
                long vfy_result = va_arg(alist, long);

                ok = va_arg(alist, int *);
                /* There is a table in handle_invalid_peer_certificate() of mail-file.c */
                crm_err("SMTP_EV_INVALID_PEER_CERTIFICATE: %ld", vfy_result);
                *ok = 1;
                break;
            }
        case SMTP_EV_NO_PEER_CERTIFICATE:
            ok = va_arg(alist, int *);
            crm_debug("SMTP_EV_NO_PEER_CERTIFICATE - accepted.");
            *ok = 1;
            break;
        case SMTP_EV_WRONG_PEER_CERTIFICATE:
            ok = va_arg(alist, int *);
            crm_debug("SMTP_EV_WRONG_PEER_CERTIFICATE - accepted.");
            *ok = 1;
            break;
        case SMTP_EV_NO_CLIENT_CERTIFICATE:
            ok = va_arg(alist, int *);
            crm_debug("SMTP_EV_NO_CLIENT_CERTIFICATE - accepted.");
            *ok = 1;
            break;
        default:
            crm_debug("Got event: %d - ignored.\n", event_no);
    }
    va_end(alist);
}
#endif

#define BODY_MAX 2048

#if ENABLE_ESMTP
static void
crm_smtp_debug(const char *buf, int buflen, int writing, void *arg)
{
    char type = 0;
    int lpc = 0, last = 0, level = *(int *)arg;

    if (writing == SMTP_CB_HEADERS) {
        type = 'H';
    } else if (writing) {
        type = 'C';
    } else {
        type = 'S';
    }

    for (; lpc < buflen; lpc++) {
        switch (buf[lpc]) {
            case 0:
            case '\n':
                if (last > 0) {
                    do_crm_log(level, "   %.*s", lpc - last, buf + last);
                } else {
                    do_crm_log(level, "%c: %.*s", type, lpc - last, buf + last);
                }
                last = lpc + 1;
                break;
        }
    }
}
#endif
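/*
 * The external agent receives the whole event through CRM_notify_*
 * environment variables (recipient, node, rsc, task, desc, rc, target_rc,
 * status), all set below before the fork().  A trivial agent could be, for
 * example, a shell script containing only:
 *
 *     logger "$CRM_notify_task on $CRM_notify_node: $CRM_notify_desc"
 */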
static int
send_custom_trap(const char *node, const char *rsc, const char *task, int target_rc, int rc,
                 int status, const char *desc)
{
    pid_t pid;

    /* setenv() needs chars, these are ints */
    char *rc_s = crm_itoa(rc);
    char *status_s = crm_itoa(status);
    char *target_rc_s = crm_itoa(target_rc);

    crm_debug("Sending external notification to '%s' via '%s'", external_recipient,
              external_agent);

    setenv("CRM_notify_recipient", external_recipient, 1);
    setenv("CRM_notify_node", node, 1);
    setenv("CRM_notify_rsc", rsc, 1);
    setenv("CRM_notify_task", task, 1);
    setenv("CRM_notify_desc", desc, 1);
    setenv("CRM_notify_rc", rc_s, 1);
    setenv("CRM_notify_target_rc", target_rc_s, 1);
    setenv("CRM_notify_status", status_s, 1);

    pid = fork();
    if (pid == -1) {
        crm_perror(LOG_ERR, "notification fork() failed.");
    }
    if (pid == 0) {
        /* crm_debug("notification: I am the child. Executing the notification program."); */
        execl(external_agent, external_agent, NULL);
    }

    crm_trace("Finished running custom notification program '%s'.", external_agent);
    free(target_rc_s);
    free(status_s);
    free(rc_s);
    return 0;
}
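/*
 * E-mail notifications via libESMTP.  Options left unset fall back to the
 * defaults applied at the top of send_smtp_trap(): crm_mail_host to
 * "localhost:25", crm_mail_prefix to "Cluster notification", and the
 * envelope sender to crm_mon@<nodename> when crm_mail_from is unset.
 */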
static int
send_smtp_trap(const char *node, const char *rsc, const char *task, int target_rc, int rc,
               int status, const char *desc)
{
#if ENABLE_ESMTP
    smtp_session_t session;
    smtp_message_t message;
    auth_context_t authctx;
    struct sigaction sa;

    int len = 20;
    int noauth = 1;
    int smtp_debug = LOG_DEBUG;

    char crm_mail_body[BODY_MAX];
    char *crm_mail_subject = NULL;

    memset(&sa, 0, sizeof(struct sigaction));

    if (node == NULL) {
        node = "-";
    }
    if (rsc == NULL) {
        rsc = "-";
    }
    if (desc == NULL) {
        desc = "-";
    }

    if (crm_mail_to == NULL) {
        return 1;
    }

    if (crm_mail_host == NULL) {
        crm_mail_host = "localhost:25";
    }

    if (crm_mail_prefix == NULL) {
        crm_mail_prefix = "Cluster notification";
    }

    crm_debug("Sending '%s' mail to %s via %s", crm_mail_prefix, crm_mail_to, crm_mail_host);

    len += strlen(crm_mail_prefix);
    len += strlen(task);
    len += strlen(rsc);
    len += strlen(node);
    len += strlen(desc);
    len++;

    crm_mail_subject = calloc(1, len);
    snprintf(crm_mail_subject, len, "%s - %s event for %s on %s: %s\r\n",
             crm_mail_prefix, task, rsc, node, desc);

    len = 0;
    len += snprintf(crm_mail_body + len, BODY_MAX - len, "\r\n%s\r\n", crm_mail_prefix);
    len += snprintf(crm_mail_body + len, BODY_MAX - len, "====\r\n\r\n");
    if (rc == target_rc) {
        len += snprintf(crm_mail_body + len, BODY_MAX - len,
                        "Completed operation %s for resource %s on %s\r\n", task, rsc, node);
    } else {
        len += snprintf(crm_mail_body + len, BODY_MAX - len,
                        "Operation %s for resource %s on %s failed: %s\r\n", task, rsc, node,
                        desc);
    }

    len += snprintf(crm_mail_body + len, BODY_MAX - len, "\r\nDetails:\r\n");
    len += snprintf(crm_mail_body + len, BODY_MAX - len,
                    "\toperation status: (%d) %s\r\n", status, services_lrm_status_str(status));
    if (status == PCMK_LRM_OP_DONE) {
        len += snprintf(crm_mail_body + len, BODY_MAX - len,
                        "\tscript returned: (%d) %s\r\n", rc, lrmd_event_rc2str(rc));
        len += snprintf(crm_mail_body + len, BODY_MAX - len,
                        "\texpected return value: (%d) %s\r\n", target_rc,
                        lrmd_event_rc2str(target_rc));
    }

    auth_client_init();
    session = smtp_create_session();
    message = smtp_add_message(session);

    smtp_starttls_enable(session, Starttls_ENABLED);

    sa.sa_handler = SIG_IGN;
    sigemptyset(&sa.sa_mask);
    sa.sa_flags = 0;
    sigaction(SIGPIPE, &sa, NULL);

    smtp_set_server(session, crm_mail_host);

    authctx = auth_create_context();
    auth_set_mechanism_flags(authctx, AUTH_PLUGIN_PLAIN, 0);

    smtp_set_eventcb(session, event_cb, NULL);

    /* Now tell libESMTP it can use the SMTP AUTH extension. */
    if (!noauth) {
        crm_debug("Adding authentication context");
        smtp_auth_set_context(session, authctx);
    }

    if (crm_mail_from == NULL) {
        struct utsname us;
        char auto_from[BODY_MAX];

        CRM_ASSERT(uname(&us) == 0);
        snprintf(auto_from, BODY_MAX, "crm_mon@%s", us.nodename);
        smtp_set_reverse_path(message, auto_from);

    } else {
        /* NULL is ok */
        smtp_set_reverse_path(message, crm_mail_from);
    }

    smtp_set_header(message, "To", NULL /*phrase */ , NULL /*addr */ );   /* "Phrase" <addr> */
    smtp_add_recipient(message, crm_mail_to);

    /* Set the Subject: header and override any subject line in the message headers. */
    smtp_set_header(message, "Subject", crm_mail_subject);
    smtp_set_header_option(message, "Subject", Hdr_OVERRIDE, 1);

    smtp_set_message_str(message, crm_mail_body);
    smtp_set_monitorcb(session, crm_smtp_debug, &smtp_debug, 1);

    if (smtp_start_session(session)) {
        char buf[128];
        int rc = smtp_errno();

        crm_err("SMTP server problem: %s (%d)", smtp_strerror(rc, buf, sizeof buf), rc);

    } else {
        char buf[128];
        int rc = smtp_errno();
        const smtp_status_t *smtp_status = smtp_message_transfer_status(message);

        if (rc != 0) {
            crm_err("SMTP server problem: %s (%d)", smtp_strerror(rc, buf, sizeof buf), rc);
        }
        crm_info("Send status: %d %s", smtp_status->code, crm_str(smtp_status->text));
        smtp_enumerate_recipients(message, print_recipient_status, NULL);
    }

    smtp_destroy_session(session);
    auth_destroy_context(authctx);
    auth_client_exit();

    free(crm_mail_subject);     /* allocated above for the Subject: header */
#endif
    return 0;
}
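/*
 * handle_rsc_op() turns one CIB operation update into notifications.  The
 * "transition magic" attribute packs the operation's status, rc and
 * originating transition into a single string (roughly
 * "op-status:op-rc;action:transition:target-rc:te-uuid"), which
 * decode_transition_magic() splits apart again; parse_op_key() then recovers
 * the resource name, task and interval from keys such as
 * "myrsc_monitor_10000".
 */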
static void
handle_rsc_op(xmlNode * rsc_op)
{
    int rc = -1;
    int status = -1;
    int action = -1;
    int interval = 0;
    int target_rc = -1;
    int transition_num = -1;
    gboolean notify = TRUE;

    char *rsc = NULL;
    char *task = NULL;
    const char *desc = NULL;
    const char *node = NULL;
    const char *magic = NULL;
    const char *id = crm_element_value(rsc_op, XML_LRM_ATTR_TASK_KEY);
    char *update_te_uuid = NULL;

    xmlNode *n = rsc_op;

    if (id == NULL) {
        /* Compatibility with <= 1.1.5 */
        id = ID(rsc_op);
    }

    magic = crm_element_value(rsc_op, XML_ATTR_TRANSITION_MAGIC);
    if (magic == NULL) {
        /* non-change */
        return;
    }

    if (FALSE == decode_transition_magic(magic, &update_te_uuid, &transition_num, &action,
                                         &status, &rc, &target_rc)) {
        crm_err("Invalid event %s detected for %s", magic, id);
        return;
    }

    if (parse_op_key(id, &rsc, &task, &interval) == FALSE) {
        crm_err("Invalid event detected for %s", id);
        goto bail;
    }

    while (n != NULL && safe_str_neq(XML_CIB_TAG_STATE, TYPE(n))) {
        n = n->parent;
    }

    node = crm_element_value(n, XML_ATTR_UNAME);
    if (node == NULL) {
        node = ID(n);
    }
    if (node == NULL) {
        crm_err("No node detected for event %s (%s)", magic, id);
        goto bail;
    }

    /* look up where we expected it to be? */
    desc = pcmk_strerror(pcmk_ok);
    if (status == PCMK_LRM_OP_DONE && target_rc == rc) {
        crm_notice("%s of %s on %s completed: %s", task, rsc, node, desc);
        if (rc == PCMK_EXECRA_NOT_RUNNING) {
            notify = FALSE;
        }

    } else if (status == PCMK_LRM_OP_DONE) {
        desc = lrmd_event_rc2str(rc);
        crm_warn("%s of %s on %s failed: %s", task, rsc, node, desc);

    } else {
        desc = services_lrm_status_str(status);
        crm_warn("%s of %s on %s failed: %s", task, rsc, node, desc);
    }

    if (notify && snmp_target) {
        send_snmp_trap(node, rsc, task, target_rc, rc, status, desc);
    }
    if (notify && crm_mail_to) {
        send_smtp_trap(node, rsc, task, target_rc, rc, status, desc);
    }
    if (notify && external_agent) {
        send_custom_trap(node, rsc, task, target_rc, rc, status, desc);
    }

  bail:
    free(update_te_uuid);
    free(rsc);
    free(task);
}

void
crm_diff_update(const char *event, xmlNode * msg)
{
    int rc = -1;
    long now = time(NULL);
    const char *op = NULL;

    print_dot();

    if (current_cib != NULL) {
        xmlNode *cib_last = current_cib;

        current_cib = NULL;
        rc = cib_apply_patch_event(msg, cib_last, &current_cib, LOG_DEBUG);
        free_xml(cib_last);

        switch (rc) {
            case pcmk_err_diff_resync:
            case pcmk_err_diff_failed:
                crm_warn("[%s] %s Patch aborted: %s (%d)", event, op, pcmk_strerror(rc), rc);
                /* fall through: a fresh copy is fetched below */
            case pcmk_ok:
                break;
            default:
                crm_warn("[%s] %s ABORTED: %s (%d)", event, op, pcmk_strerror(rc), rc);
                return;
        }
    }

    if (current_cib == NULL) {
        current_cib = get_cib_copy(cib);
    }

    if (crm_mail_to || snmp_target || external_agent) {
        /* Process operation updates */
        xmlXPathObject *xpathObj = xpath_search(msg,
                                                "//" F_CIB_UPDATE_RESULT "//" XML_TAG_DIFF_ADDED
                                                "//" XML_LRM_TAG_RSC_OP);

        if (xpathObj && xpathObj->nodesetval->nodeNr > 0) {
            int lpc = 0, max = xpathObj->nodesetval->nodeNr;

            for (lpc = 0; lpc < max; lpc++) {
                xmlNode *rsc_op = getXpathResult(xpathObj, lpc);

                handle_rsc_op(rsc_op);
            }
        }
        if (xpathObj) {
            xmlXPathFreeObject(xpathObj);
        }
    }

    if ((now - last_refresh) > (reconnect_msec / 1000)) {
        /* Force a refresh */
        mon_refresh_display(NULL);

    } else {
        mainloop_set_trigger(refresh_trigger);
    }
}

gboolean
mon_refresh_display(gpointer user_data)
{
    xmlNode *cib_copy = copy_xml(current_cib);
    pe_working_set_t data_set;

    last_refresh = time(NULL);

    if (cli_config_update(&cib_copy, NULL, FALSE) == FALSE) {
        if (cib) {
            cib->cmds->signoff(cib);
        }
        print_as("Upgrade failed: %s", pcmk_strerror(-pcmk_err_dtd_validation));
        if (as_console) {
            sleep(2);
        }
        clean_up(EX_USAGE);
        return FALSE;
    }

    set_working_set_defaults(&data_set);
    data_set.input = cib_copy;
    cluster_status(&data_set);

    if (as_html_file || web_cgi) {
        if (print_html_status(&data_set, as_html_file, web_cgi) != 0) {
            fprintf(stderr, "Critical: Unable to output html file\n");
            clean_up(EX_USAGE);
        }

    } else if (as_xml) {
        if (print_xml_status(&data_set) != 0) {
            fprintf(stderr, "Critical: Unable to output xml file\n");
            clean_up(EX_USAGE);
        }

    } else if (daemonize) {
        /* do nothing */

    } else if (simple_status) {
        print_simple_status(&data_set);
        if (has_warnings) {
            clean_up(EX_USAGE);
        }

    } else {
        print_status(&data_set);
    }

    cleanup_calculations(&data_set);
    return TRUE;
}

void
mon_st_callback(stonith_t * st, stonith_event_t * e)
{
    char *desc = g_strdup_printf("Operation %s requested by %s for peer %s: %s (ref=%s)",
                                 e->operation, e->origin, e->target,
                                 pcmk_strerror(e->result), e->id);

    if (snmp_target) {
        send_snmp_trap(e->target, NULL, e->operation, pcmk_ok, e->result, 0, desc);
    }
    if (crm_mail_to) {
        send_smtp_trap(e->target, NULL, e->operation, pcmk_ok, e->result, 0, desc);
    }
    if (external_agent) {
        send_custom_trap(e->target, NULL, e->operation, pcmk_ok, e->result, 0, desc);
    }
    g_free(desc);
}
/*
 * De-init ncurses, signoff from the CIB and deallocate memory.
 * A non-negative rc is passed straight to exit(); a negative rc cleans up
 * and returns to the caller instead.
 */
void
clean_up(int rc)
{
#if ENABLE_SNMP
    netsnmp_session *session = crm_snmp_init(NULL, NULL);

    if (session) {
        snmp_close(session);
        snmp_shutdown("snmpapp");
    }
#endif

#if CURSES_ENABLED
    if (as_console) {
        as_console = FALSE;
        echo();
        nocbreak();
        endwin();
    }
#endif

    if (cib != NULL) {
        cib->cmds->signoff(cib);
        cib_delete(cib);
        cib = NULL;
    }

    free(as_html_file);
    free(xml_file);
    free(pid_file);

    if (rc >= 0) {
        exit(rc);
    }
    return;
}